diff --git a/projects/AlignDETR/README.md b/projects/AlignDETR/README.md deleted file mode 100644 index 33690fe0c..000000000 --- a/projects/AlignDETR/README.md +++ /dev/null @@ -1,33 +0,0 @@ -# AlignDETR - -> [Align-DETR: Improving DETR with Simple IoU-aware BCE loss](https://arxiv.org/abs/2304.07527) - - - -## Abstract - -DETR has set up a simple end-to-end pipeline for object detection by formulating this task as a set prediction problem, showing promising potential. However, despite the significant progress in improving DETR, this paper identifies a problem of misalignment in the output distribution, which prevents the best-regressed samples from being assigned with high confidence, hindering the model's accuracy. We propose a metric, recall of best-regressed samples, to quantitively evaluate the misalignment problem. Observing its importance, we propose a novel Align-DETR that incorporates a localization precision-aware classification loss in optimization. The proposed loss, IA-BCE, guides the training of DETR to build a strong correlation between classification score and localization precision. We also adopt the mixed-matching strategy, to facilitate DETR-based detectors with faster training convergence while keeping an end-to-end scheme. Moreover, to overcome the dramatic decrease in sample quality induced by the sparsity of queries, we introduce a prime sample weighting mechanism to suppress the interference of unimportant samples. Extensive experiments are conducted with very competitive results reported. In particular, it delivers a 46 (+3.8)% AP on the DAB-DETR baseline with the ResNet-50 backbone and reaches a new SOTA performance of 50.2% AP in the 1x setting on the COCO validation set when employing the strong baseline DINO. - -![image](https://github.com/open-mmlab/mmdetection/assets/33146359/5a4fa664-b4c6-487d-b6d8-22be9d59a2bc) - -## Results and Models - -| Backbone | Model | Lr schd | box AP | Config | Download | -| :------: | :---------: | :-----: | :----: | :------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| R-50 | DINO-4scale | 12e | 50.5 | [config](./align_detr-4scale_r50_8xb2-12e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/align_detr/align_detr-4scale_r50_8xb2-12e_coco/align_detr-4scale_r50_8xb2-12e_coco_20230914_095734-61f921af.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/align_detr/align_detr-4scale_r50_8xb2-12e_coco/align_detr-4scale_r50_8xb2-12e_coco_20230914_095734.log.json) | -| R-50 | DINO-4scale | 24e | 51.4 | [config](./align_detr-4scale_r50_8xb2-24e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/align_detr/align_detr-4scale_r50_8xb2-24e_coco/align_detr-4scale_r50_8xb2-24e_coco_20230919_152414-f4b6cf76.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/align_detr/align_detr-4scale_r50_8xb2-24e_coco/align_detr-4scale_r50_8xb2-24e_coco_20230919_152414.log.json) | - -## Citation - -We provide the config files for AlignDETR: [Align-DETR: Improving DETR with Simple IoU-aware BCE loss](https://arxiv.org/abs/2304.07527). 
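The IA-BCE loss these configs enable replaces the usual one-hot classification target with `t = s^alpha * u^(1 - alpha)`, the weighted geometric mean of the predicted score `s` and the IoU `u` described in the abstract above. Below is a minimal sketch of that target computation with hypothetical tensor names (the project's actual implementation lives in `_get_align_detr_targets_single` of `align_detr_head.py` further down; `alpha=0.25` is the head's documented default):

```python
import torch

def ia_bce_target(cls_prob: torch.Tensor, iou: torch.Tensor,
                  alpha: float = 0.25) -> torch.Tensor:
    """Toy IA-BCE target: weighted geometric mean of score and IoU.

    cls_prob: sigmoid scores of the positive predictions, shape (num_pos,)
    iou: IoU between those predictions and their assigned gt boxes, shape (num_pos,)
    """
    t = cls_prob.pow(alpha) * iou.pow(1 - alpha)
    return t.clamp(min=0.01).detach()  # used as the BCE target for positive samples

# Example: a well-localized box with a modest score still receives a high target.
print(ia_bce_target(torch.tensor([0.3, 0.9]), torch.tensor([0.9, 0.3])))
```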
- -```latex -@misc{cai2023aligndetr, - title={Align-DETR: Improving DETR with Simple IoU-aware BCE loss}, - author={Zhi Cai and Songtao Liu and Guodong Wang and Zheng Ge and Xiangyu Zhang and Di Huang}, - year={2023}, - eprint={2304.07527}, - archivePrefix={arXiv}, - primaryClass={cs.CV} -} -``` diff --git a/projects/AlignDETR/align_detr/__init__.py b/projects/AlignDETR/align_detr/__init__.py deleted file mode 100644 index 26a49b524..000000000 --- a/projects/AlignDETR/align_detr/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .align_detr_head import AlignDETRHead -from .mixed_hungarian_assigner import MixedHungarianAssigner - -__all__ = ['AlignDETRHead', 'MixedHungarianAssigner'] diff --git a/projects/AlignDETR/align_detr/align_detr_head.py b/projects/AlignDETR/align_detr/align_detr_head.py deleted file mode 100644 index c06d1bd40..000000000 --- a/projects/AlignDETR/align_detr/align_detr_head.py +++ /dev/null @@ -1,508 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Dict, List, Tuple, Union - -import torch -from mmengine.structures import InstanceData -from torch import Tensor - -from mmdet.models.dense_heads import DINOHead -from mmdet.registry import MODELS -from mmdet.structures.bbox import (bbox_cxcywh_to_xyxy, bbox_overlaps, - bbox_xyxy_to_cxcywh) -from mmdet.utils import InstanceList -from .utils import KeysRecorder - - -@MODELS.register_module() -class AlignDETRHead(DINOHead): - r"""Head of the Align-DETR: Improving DETR with Simple IoU-aware BCE loss - - Code is modified from the `official github repo - `_. - - More details can be found in the `paper - `_ . - - Args: - all_layers_num_gt_repeat List[int]: Number to repeat gt for 1-to-k - matching between ground truth and predictions of each decoder - layer. Only used for matching queries, not for denoising queries. - Element count is `num_pred_layer`. If `as_two_stage` is True, then - the last element is for encoder output, and the others for - decoder layers. Otherwise, all elements are for decoder layers. - Defaults to a list of `1` for the last decoder layer and `2` for - the others. - alpha (float): Hyper-parameter of classification loss that controls - the proportion of each item to calculate `t`, the weighted - geometric average of the confident score and the IoU score, to - align classification and regression scores. Defaults to `0.25`. - gamma (float): Hyper-parameter of classification loss to do the hard - negative mining. Defaults to `2.0`. - tau (float): Hyper-parameter of classification and regression losses, - it is the temperature controlling the sharpness of the function - to calculate positive sample weight. Defaults to `1.5`. - """ - - def __init__(self, - *args, - all_layers_num_gt_repeat: List[int] = None, - alpha: float = 0.25, - gamma: float = 2.0, - tau: float = 1.5, - **kwargs) -> None: - self.all_layers_num_gt_repeat = all_layers_num_gt_repeat - self.alpha = alpha - self.gamma = gamma - self.tau = tau - self.weight_table = torch.zeros( - len(all_layers_num_gt_repeat), max(all_layers_num_gt_repeat)) - for layer_index, num_gt_repeat in enumerate(all_layers_num_gt_repeat): - self.weight_table[layer_index][:num_gt_repeat] = torch.exp( - -torch.arange(num_gt_repeat) / tau) - - super().__init__(*args, **kwargs) - assert len(self.all_layers_num_gt_repeat) == self.num_pred_layer - - def loss_by_feat(self, all_layers_cls_scores: Tensor, *args, - **kwargs) -> Any: - """Loss function. 
- AlignDETR: This method is based on `DINOHead.loss_by_feat`. - - Args: - all_layers_cls_scores (Tensor): Classification scores of all - decoder layers, has shape (num_decoder_layers, bs, - num_queries_total, cls_out_channels), where - `num_queries_total` is the sum of `num_denoising_queries` - and `num_matching_queries`. - Returns: - dict[str, Tensor]: A dictionary of loss components. - """ - # Wrap `all_layers_cls_scores` with KeysRecorder to record its - # `__getitem__` keys and get decoder layer index. - all_layers_cls_scores = KeysRecorder(all_layers_cls_scores) - result = super(AlignDETRHead, - self).loss_by_feat(all_layers_cls_scores, *args, - **kwargs) - return result - - def loss_by_feat_single(self, cls_scores: Union[KeysRecorder, Tensor], - bbox_preds: Tensor, - batch_gt_instances: InstanceList, - batch_img_metas: List[dict]) -> Tuple[Tensor]: - """Loss function for outputs from a single decoder layer of a single - feature level. - AlignDETR: This method is based on `DINOHead.loss_by_feat_single`. - - Args: - cls_scores (Union[KeysRecorder, Tensor]): Box score logits from a - single decoder layer for all images, has shape (bs, - num_queries, cls_out_channels). - bbox_preds (Tensor): Sigmoid outputs from a single decoder layer - for all images, with normalized coordinate (cx, cy, w, h) and - shape (bs, num_queries, 4). - batch_gt_instances (list[:obj:`InstanceData`]): Batch of - gt_instance. It usually includes ``bboxes`` and ``labels`` - attributes. - batch_img_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - - Returns: - Tuple[Tensor]: A tuple including `loss_cls`, `loss_box` and - `loss_iou`. - """ - # AlignDETR: Get layer_index. - if isinstance(cls_scores, KeysRecorder): - # Outputs are from decoder layer. Get layer_index from - # `__getitem__` keys history. - keys = [key for key in cls_scores.keys if isinstance(key, int)] - assert len(keys) == 1, \ - 'Failed to extract key from cls_scores.keys: {}'.format(keys) - layer_index = keys[0] - # Get dn_cls_scores tensor. - cls_scores = cls_scores.obj - else: - # Outputs are from encoder layer. - layer_index = self.num_pred_layer - 1 - - for img_meta in batch_img_metas: - img_meta['layer_index'] = layer_index - - results = super(AlignDETRHead, self).loss_by_feat_single( - cls_scores, - bbox_preds, - batch_gt_instances=batch_gt_instances, - batch_img_metas=batch_img_metas) - return results - - def get_targets(self, cls_scores_list: List[Tensor], - bbox_preds_list: List[Tensor], - batch_gt_instances: InstanceList, - batch_img_metas: List[dict]) -> tuple: - """Compute regression and classification targets for a batch image. - - Outputs from a single decoder layer of a single feature level are used. - AlignDETR: This method is based on `DETRHead.get_targets`. - - Args: - cls_scores_list (list[Tensor]): Box score logits from a single - decoder layer for each image, has shape [num_queries, - cls_out_channels]. - bbox_preds_list (list[Tensor]): Sigmoid outputs from a single - decoder layer for each image, with normalized coordinate - (cx, cy, w, h) and shape [num_queries, 4]. - batch_gt_instances (list[:obj:`InstanceData`]): Batch of - gt_instance. It usually includes ``bboxes`` and ``labels`` - attributes. - batch_img_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - - Returns: - tuple: a tuple containing the following targets. - - - labels_list (list[Tensor]): Labels for all images. 
- - label_weights_list (list[Tensor]): Label weights for all images. - - bbox_targets_list (list[Tensor]): BBox targets for all images. - - bbox_weights_list (list[Tensor]): BBox weights for all images. - - num_total_pos (int): Number of positive samples in all images. - - num_total_neg (int): Number of negative samples in all images. - """ - results = super(AlignDETRHead, - self).get_targets(cls_scores_list, bbox_preds_list, - batch_gt_instances, batch_img_metas) - - # AlignDETR: `num_total_pos` for matching queries is the number of - # unique gt bboxes in the batch. Refer to AlignDETR official code: - # https://github.com/FelixCaae/AlignDETR/blob/8c2b1806026e1b33fe1c282577de1647e352d7f0/aligndetr/criterions/base_criterion.py#L195C15-L195C15 # noqa: E501 - num_total_pos = sum( - len(gt_instances) for gt_instances in batch_gt_instances) - - results = list(results) - results[-2] = num_total_pos - return tuple(results) - - def _get_targets_single(self, cls_score: Tensor, bbox_pred: Tensor, - gt_instances: InstanceData, - img_meta: dict) -> tuple: - """Compute regression and classification targets for one image. - - Outputs from a single decoder layer of a single feature level are used. - AlignDETR: This method is based on `DETRHead._get_targets_single`. - - Args: - cls_score (Tensor): Box score logits from a single decoder layer - for one image. Shape [num_queries, cls_out_channels]. - bbox_pred (Tensor): Sigmoid outputs from a single decoder layer - for one image, with normalized coordinate (cx, cy, w, h) and - shape [num_queries, 4]. - gt_instances (:obj:`InstanceData`): Ground truth of instance - annotations. It should includes ``bboxes`` and ``labels`` - attributes. - img_meta (dict): Meta information for one image. - layer_index (int): Decoder layer index for the outputs. Defaults - to `-1`. - - Returns: - tuple[Tensor]: a tuple containing the following for one image. - - - labels (Tensor): Labels of each image. - - label_weights (Tensor]): Label weights of each image. - - bbox_targets (Tensor): BBox targets of each image. - - bbox_weights (Tensor): BBox weights of each image. - - pos_inds (Tensor): Sampled positive indices for each image. - - neg_inds (Tensor): Sampled negative indices for each image. - """ - img_h, img_w = img_meta['img_shape'] - factor = bbox_pred.new_tensor([img_w, img_h, img_w, - img_h]).unsqueeze(0) - # convert bbox_pred from xywh, normalized to xyxy, unnormalized - bbox_pred = bbox_cxcywh_to_xyxy(bbox_pred) - bbox_pred = bbox_pred * factor - - pred_instances = InstanceData(scores=cls_score, bboxes=bbox_pred) - - # assigner and sampler - # AlignDETR: Get `k` of current layer. - layer_index = img_meta['layer_index'] - num_gt_repeat = self.all_layers_num_gt_repeat[layer_index] - assign_result = self.assigner.assign( - pred_instances=pred_instances, - gt_instances=gt_instances, - img_meta=img_meta, - k=num_gt_repeat) - - gt_bboxes = gt_instances.bboxes - gt_labels = gt_instances.labels - pos_inds = torch.nonzero( - assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique() - neg_inds = torch.nonzero( - assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique() - pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 - pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds.long(), :] - - # AlignDETR: Get label targets, label weights, and bbox weights. 
- target_results = self._get_align_detr_targets_single( - cls_score, - bbox_pred, - gt_labels, - pos_gt_bboxes, - pos_inds, - pos_assigned_gt_inds, - layer_index, - is_matching_queries=True) - - label_targets, label_weights, bbox_weights = target_results - - # bbox targets - bbox_targets = torch.zeros_like(bbox_pred, dtype=gt_bboxes.dtype) - - # DETR regress the relative position of boxes (cxcywh) in the image. - # Thus the learning target should be normalized by the image size, also - # the box format should be converted from defaultly x1y1x2y2 to cxcywh. - pos_gt_bboxes_normalized = pos_gt_bboxes / factor - pos_gt_bboxes_targets = bbox_xyxy_to_cxcywh(pos_gt_bboxes_normalized) - bbox_targets[pos_inds] = pos_gt_bboxes_targets - return (label_targets, label_weights, bbox_targets, bbox_weights, - pos_inds, neg_inds) - - def _loss_dn_single(self, dn_cls_scores: KeysRecorder, - dn_bbox_preds: Tensor, - batch_gt_instances: InstanceList, - batch_img_metas: List[dict], - dn_meta: Dict[str, int]) -> Tuple[Tensor]: - """Denoising loss for outputs from a single decoder layer. - AlignDETR: This method is based on `DINOHead._loss_dn_single`. - - Args: - dn_cls_scores (KeysRecorder): Classification scores of a single - decoder layer in denoising part, has shape (bs, - num_denoising_queries, cls_out_channels). - dn_bbox_preds (Tensor): Regression outputs of a single decoder - layer in denoising part. Each is a 4D-tensor with normalized - coordinate format (cx, cy, w, h) and has shape - (bs, num_denoising_queries, 4). - batch_gt_instances (list[:obj:`InstanceData`]): Batch of - gt_instance. It usually includes ``bboxes`` and ``labels`` - attributes. - batch_img_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - dn_meta (Dict[str, int]): The dictionary saves information about - group collation, including 'num_denoising_queries' and - 'num_denoising_groups'. It will be used for split outputs of - denoising and matching parts and loss calculation. - - Returns: - Tuple[Tensor]: A tuple including `loss_cls`, `loss_box` and - `loss_iou`. - """ - # AlignDETR: Get dn_cls_scores tensor. - dn_cls_scores = dn_cls_scores.obj - - # AlignDETR: Add layer outputs to meta info because they are not - # variables of method `_get_dn_targets_single`. - for image_index, img_meta in enumerate(batch_img_metas): - img_meta['dn_cls_score'] = dn_cls_scores[image_index] - img_meta['dn_bbox_pred'] = dn_bbox_preds[image_index] - - results = super()._loss_dn_single(dn_cls_scores, dn_bbox_preds, - batch_gt_instances, batch_img_metas, - dn_meta) - return results - - def _get_dn_targets_single(self, gt_instances: InstanceData, - img_meta: dict, dn_meta: Dict[str, - int]) -> tuple: - """Get targets in denoising part for one image. - AlignDETR: This method is based on - `DINOHead._get_dn_targets_single`. - and 1) Added passing `dn_cls_score`, `dn_bbox_pred` to this - method; 2) Modified the way to get targets. - Args: - dn_cls_score (Tensor): Box score logits from a single decoder - layer in denoising part for one image, has shape - [num_denoising_queries, cls_out_channels]. - dn_bbox_pred (Tensor): Sigmoid outputs from a single decoder - layer in denoising part for one image, with - normalized coordinate (cx, cy, w, h) and shape - [num_denoising_queries, 4]. - gt_instances (:obj:`InstanceData`): Ground truth of instance - annotations. It should includes ``bboxes`` and ``labels`` - attributes. - img_meta (dict): Meta information for one image. 
- dn_meta (Dict[str, int]): The dictionary saves information about - group collation, including 'num_denoising_queries' and - 'num_denoising_groups'. It will be used for split outputs of - denoising and matching parts and loss calculation. - - Returns: - tuple[Tensor]: a tuple containing the following for one image. - - - labels (Tensor): Labels of each image. - - label_weights (Tensor]): Label weights of each image. - - bbox_targets (Tensor): BBox targets of each image. - - bbox_weights (Tensor): BBox weights of each image. - - pos_inds (Tensor): Sampled positive indices for each image. - - neg_inds (Tensor): Sampled negative indices for each image. - """ - gt_bboxes = gt_instances.bboxes - gt_labels = gt_instances.labels - num_groups = dn_meta['num_denoising_groups'] - num_denoising_queries = dn_meta['num_denoising_queries'] - num_queries_each_group = int(num_denoising_queries / num_groups) - device = gt_bboxes.device - - if len(gt_labels) > 0: - t = torch.arange(len(gt_labels), dtype=torch.long, device=device) - t = t.unsqueeze(0).repeat(num_groups, 1) - pos_assigned_gt_inds = t.flatten() - pos_inds = torch.arange( - num_groups, dtype=torch.long, device=device) - pos_inds = pos_inds.unsqueeze(1) * num_queries_each_group + t - pos_inds = pos_inds.flatten() - else: - pos_inds = pos_assigned_gt_inds = \ - gt_bboxes.new_tensor([], dtype=torch.long) - - neg_inds = pos_inds + num_queries_each_group // 2 - - # AlignDETR: Get meta info and layer outputs. - img_h, img_w = img_meta['img_shape'] - dn_cls_score = img_meta['dn_cls_score'] - dn_bbox_pred = img_meta['dn_bbox_pred'] - factor = dn_bbox_pred.new_tensor([img_w, img_h, img_w, - img_h]).unsqueeze(0) - - # AlignDETR: Convert dn_bbox_pred from xywh, normalized to xyxy, - # unnormalized. - dn_bbox_pred = bbox_cxcywh_to_xyxy(dn_bbox_pred) - dn_bbox_pred = dn_bbox_pred * factor - - # AlignDETR: Get label targets, label weights, and bbox weights. - target_results = self._get_align_detr_targets_single( - dn_cls_score, dn_bbox_pred, gt_labels, - gt_bboxes.repeat([num_groups, 1]), pos_inds, pos_assigned_gt_inds) - - label_targets, label_weights, bbox_weights = target_results - - # bbox targets - bbox_targets = torch.zeros(num_denoising_queries, 4, device=device) - - # DETR regress the relative position of boxes (cxcywh) in the image. - # Thus the learning target should be normalized by the image size, also - # the box format should be converted from defaultly x1y1x2y2 to cxcywh. - gt_bboxes_normalized = gt_bboxes / factor - gt_bboxes_targets = bbox_xyxy_to_cxcywh(gt_bboxes_normalized) - bbox_targets[pos_inds] = gt_bboxes_targets.repeat([num_groups, 1]) - - return (label_targets, label_weights, bbox_targets, bbox_weights, - pos_inds, neg_inds) - - def _get_align_detr_targets_single(self, - cls_score: Tensor, - bbox_pred: Tensor, - gt_labels: Tensor, - pos_gt_bboxes: Tensor, - pos_inds: Tensor, - pos_assigned_gt_inds: Tensor, - layer_index: int = -1, - is_matching_queries: bool = False): - '''AlignDETR: Get label targets, label weights, and bbox weights based - on `t`, the weighted geometric average of the confident score and - the IoU score, to align classification and regression scores. - - Args: - cls_score (Tensor): Box score logits from the last encoder layer - or a single decoder layer for one image. Shape - [num_queries or num_denoising_queries, cls_out_channels]. 
- bbox_pred (Tensor): Sigmoid outputs from the last encoder layer - or a single decoder layer for one image, with unnormalized - coordinate (x, y, x, y) and shape - [num_queries or num_denoising_queries, 4]. - gt_labels (Tensor): Ground truth classification labels for one - image, has shape [num_gt]. - pos_gt_bboxes (Tensor): Positive ground truth bboxes for one - image, with unnormalized coordinate (x, y, x, y) and shape - [num_positive, 4]. - pos_inds (Tensor): Positive prediction box indices, has shape - [num_positive]. - pos_assigned_gt_inds Tensor: Positive ground truth box indices, - has shape [num_positive]. - layer_index (int): decoder layer index for the outputs. Defaults - to `-1`. - is_matching_queries (bool): The outputs are from matching - queries or denoising queries. Defaults to `False`. - - Returns: - tuple[Tensor]: a tuple containing the following for one image. - - - label_targets (Tensor): Labels of one image. Shape - [num_queries or num_denoising_queries, cls_out_channels]. - - label_weights (Tensor): Label weights of one image. Shape - [num_queries or num_denoising_queries, cls_out_channels]. - - bbox_weights (Tensor): BBox weights of one image. Shape - [num_queries or num_denoising_queries, 4]. - ''' - - # Classification loss - # = 1 * BCE(prob, t * rank_weights) for positive sample; - # = prob**gamma * BCE(prob, 0) for negative sample. - # That is, - # label_targets = 0 for negative sample; - # = t * rank_weights for positive sample. - # label_weights = pred**gamma for negative sample; - # = 1 for positive sample. - cls_prob = cls_score.sigmoid() - label_targets = torch.zeros_like( - cls_score, device=pos_gt_bboxes.device) - label_weights = cls_prob**self.gamma - - bbox_weights = torch.zeros_like(bbox_pred, dtype=pos_gt_bboxes.dtype) - - if len(pos_inds) == 0: - return label_targets, label_weights, bbox_weights - - pos_cls_score_inds = (pos_inds, gt_labels[pos_assigned_gt_inds]) - iou_scores = bbox_overlaps( - bbox_pred[pos_inds], pos_gt_bboxes, is_aligned=True) - - # t (Tensor): The weighted geometric average of the confident score - # and the IoU score, to align classification and regression scores. - # Shape [num_positive]. - t = ( - cls_prob[pos_cls_score_inds]**self.alpha * - iou_scores**(1 - self.alpha)) - t = torch.clamp(t, 0.01).detach() - - # Calculate rank_weights for matching queries. - if is_matching_queries: - # rank_weights (Tensor): Weights of each group of predictions - # assigned to the same positive gt bbox. Shape [num_positive]. - rank_weights = torch.zeros_like(t, dtype=self.weight_table.dtype) - - assert 0 <= layer_index < len(self.weight_table), layer_index - rank_to_weight = self.weight_table[layer_index].to( - rank_weights.device) - unique_gt_inds = torch.unique(pos_assigned_gt_inds) - - # For each positive gt bbox, get all predictions assigned to it, - # then calculate rank weights for this group of predictions. - for gt_index in unique_gt_inds: - pred_group_cond = pos_assigned_gt_inds == gt_index - # Weights are based on their rank sorted by t in the group. 
- pred_group = t[pred_group_cond] - indices = pred_group.sort(descending=True)[1] - group_weights = torch.zeros_like( - indices, dtype=self.weight_table.dtype) - group_weights[indices] = rank_to_weight[:len(indices)] - rank_weights[pred_group_cond] = group_weights - - t = t * rank_weights - pos_bbox_weights = rank_weights.unsqueeze(-1).repeat( - 1, bbox_pred.size(-1)) - bbox_weights[pos_inds] = pos_bbox_weights - else: - bbox_weights[pos_inds] = 1.0 - - label_targets[pos_cls_score_inds] = t - label_weights[pos_cls_score_inds] = 1.0 - - return label_targets, label_weights, bbox_weights diff --git a/projects/AlignDETR/align_detr/mixed_hungarian_assigner.py b/projects/AlignDETR/align_detr/mixed_hungarian_assigner.py deleted file mode 100644 index cc31b5e6a..000000000 --- a/projects/AlignDETR/align_detr/mixed_hungarian_assigner.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from typing import List, Optional, Union - -import torch -from mmengine import ConfigDict -from mmengine.structures import InstanceData -from scipy.optimize import linear_sum_assignment -from torch import Tensor - -from mmdet.models.task_modules import AssignResult, BaseAssigner -from mmdet.registry import TASK_UTILS - - -@TASK_UTILS.register_module() -class MixedHungarianAssigner(BaseAssigner): - """Computes 1-to-k matching between ground truth and predictions. - - This class computes an assignment between the targets and the predictions - based on the costs. The costs are weighted sum of some components. - For DETR the costs are weighted sum of classification cost, regression L1 - cost and regression iou cost. The targets don't include the no_object, so - generally there are more predictions than targets. After the 1-to-k - gt-pred matching, the un-matched are treated as backgrounds. Thus - each query prediction will be assigned with `0` or a positive integer - indicating the ground truth index: - - - 0: negative sample, no assigned gt - - positive integer: positive sample, index (1-based) of assigned gt - - Args: - match_costs (:obj:`ConfigDict` or dict or \ - List[Union[:obj:`ConfigDict`, dict]]): Match cost configs. - """ - - def __init__( - self, match_costs: Union[List[Union[dict, ConfigDict]], dict, - ConfigDict] - ) -> None: - - if isinstance(match_costs, dict): - match_costs = [match_costs] - elif isinstance(match_costs, list): - assert len(match_costs) > 0, \ - 'match_costs must not be a empty list.' - - self.match_costs = [ - TASK_UTILS.build(match_cost) for match_cost in match_costs - ] - - def assign(self, - pred_instances: InstanceData, - gt_instances: InstanceData, - img_meta: Optional[dict] = None, - k: int = 1, - **kwargs) -> AssignResult: - """Computes 1-to-k gt-pred matching based on the weighted costs. - - This method assign each query prediction to a ground truth or - background. The `assigned_gt_inds` with -1 means don't care, - 0 means negative sample, and positive number is the index (1-based) - of assigned gt. - The assignment is done in the following steps, the order matters. - - 1. Assign every prediction to -1. - 2. Compute the weighted costs, each cost has shape - (num_preds, num_gts). - 3. Update k according to num_preds and num_gts, then repeat - costs k times to shape: (num_preds, k * num_gts), so that each - gt will match k predictions. - 4. Do Hungarian matching on CPU based on the costs. - 5. 
Assign all to 0 (background) first, then for each matched pair - between predictions and gts, treat this prediction as foreground - and assign the corresponding gt index (plus 1) to it. - - Args: - pred_instances (:obj:`InstanceData`): Instances of model - predictions. It includes ``priors``, and the priors can - be anchors or points, or the bboxes predicted by the - previous stage, has shape (n, 4). The bboxes predicted by - the current model or stage will be named ``bboxes``, - ``labels``, and ``scores``, the same as the ``InstanceData`` - in other places. It may includes ``masks``, with shape - (n, h, w) or (n, l). - gt_instances (:obj:`InstanceData`): Ground truth of instance - annotations. It usually includes ``bboxes``, with shape (k, 4), - ``labels``, with shape (k, ) and ``masks``, with shape - (k, h, w) or (k, l). - img_meta (dict): Image information for one image. - - Returns: - :obj:`AssignResult`: The assigned result. - """ - assert isinstance(gt_instances.labels, Tensor) - num_gts, num_preds = len(gt_instances), len(pred_instances) - gt_labels = gt_instances.labels - device = gt_labels.device - - # 1. Assign -1 by default. - assigned_gt_inds = torch.full((num_preds, ), - -1, - dtype=torch.long, - device=device) - assigned_labels = torch.full((num_preds, ), - -1, - dtype=torch.long, - device=device) - - if num_gts == 0 or num_preds == 0: - # No ground truth or boxes, return empty assignment. - if num_gts == 0: - # No ground truth, assign all to background. - assigned_gt_inds[:] = 0 - return AssignResult( - num_gts=num_gts, - gt_inds=assigned_gt_inds, - max_overlaps=None, - labels=assigned_labels) - - # 2. Compute weighted costs. - cost_list = [] - for match_cost in self.match_costs: - cost = match_cost( - pred_instances=pred_instances, - gt_instances=gt_instances, - img_meta=img_meta) - cost_list.append(cost) - cost = torch.stack(cost_list).sum(dim=0) - - # 3. Update k according to num_preds and num_gts, then - # repeat the ground truth k times to perform 1-to-k gt-pred - # matching. For example, if num_preds = 900, num_gts = 3, then - # there are only 3 gt-pred pairs in sum for 1-1 matching. - # However, for 1-k gt-pred matching, if k = 4, then each - # gt is assigned 4 unique predictions, so there would be 12 - # gt-pred pairs in sum. - k = max(1, min(k, num_preds // num_gts)) - cost = cost.repeat(1, k) - - # 4. Do Hungarian matching on CPU using linear_sum_assignment. - cost = cost.detach().cpu() - if linear_sum_assignment is None: - raise ImportError('Please run "pip install scipy" ' - 'to install scipy first.') - - matched_row_inds, matched_col_inds = linear_sum_assignment(cost) - matched_row_inds = torch.from_numpy(matched_row_inds).to(device) - matched_col_inds = torch.from_numpy(matched_col_inds).to(device) - - matched_col_inds = matched_col_inds % num_gts - # 5. Assign backgrounds and foregrounds. - # Assign all indices to backgrounds first. - assigned_gt_inds[:] = 0 - # Assign foregrounds based on matching results. - assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 - assigned_labels[matched_row_inds] = gt_labels[matched_col_inds] - assign_result = AssignResult( - num_gts=k * num_gts, - gt_inds=assigned_gt_inds, - max_overlaps=None, - labels=assigned_labels) - - return assign_result diff --git a/projects/AlignDETR/align_detr/utils.py b/projects/AlignDETR/align_detr/utils.py deleted file mode 100644 index 5a3c17ec5..000000000 --- a/projects/AlignDETR/align_detr/utils.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
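A self-contained illustration of the 1-to-k matching performed by `MixedHungarianAssigner` above, using toy sizes and a random cost matrix in place of the weighted classification/L1/GIoU cost (all variable names here are made up for the sketch):

```python
import torch
from scipy.optimize import linear_sum_assignment

num_preds, num_gts, k = 6, 2, 2          # toy sizes; the real model uses ~900 queries
cost = torch.rand(num_preds, num_gts)    # stand-in for the summed match costs

# Repeat the gt columns k times so each gt can be matched to k predictions.
cost_1tok = cost.repeat(1, k)            # shape (num_preds, k * num_gts)
row_inds, col_inds = linear_sum_assignment(cost_1tok.numpy())

gt_inds = torch.from_numpy(col_inds) % num_gts       # fold repeated columns back to gt ids
assigned = torch.zeros(num_preds, dtype=torch.long)  # 0 means background
assigned[torch.from_numpy(row_inds)] = gt_inds + 1   # 1-based gt index for matched preds
print(assigned)  # exactly k predictions point at each gt, the rest stay background
```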
-from typing import Any, List, Optional - - -class KeysRecorder: - """Wrap object to record its `__getitem__` keys in the history. - - Args: - obj (object): Any object that supports `__getitem__`. - keys (List): List of keys already recorded. Default to None. - """ - - def __init__(self, obj: Any, keys: Optional[List[Any]] = None) -> None: - self.obj = obj - - if keys is None: - keys = [] - self.keys = keys - - def __getitem__(self, key: Any) -> 'KeysRecorder': - """Wrap method `__getitem__` to record its keys. - - Args: - key: Key that is passed to the object. - - Returns: - result (KeysRecorder): KeysRecorder instance that wraps sub_obj. - """ - sub_obj = self.obj.__getitem__(key) - keys = self.keys.copy() - keys.append(key) - # Create a KeysRecorder instance from the sub_obj. - result = KeysRecorder(sub_obj, keys) - return result diff --git a/projects/AlignDETR/configs/align_detr-4scale_r50_8xb2-12e_coco.py b/projects/AlignDETR/configs/align_detr-4scale_r50_8xb2-12e_coco.py deleted file mode 100644 index 0fe069905..000000000 --- a/projects/AlignDETR/configs/align_detr-4scale_r50_8xb2-12e_coco.py +++ /dev/null @@ -1,185 +0,0 @@ -_base_ = [ - '../../../configs/_base_/datasets/coco_detection.py', - '../../../configs/_base_/default_runtime.py' -] -custom_imports = dict( - imports=['projects.AlignDETR.align_detr'], allow_failed_imports=False) - -model = dict( - type='DINO', - num_queries=900, # num_matching_queries - with_box_refine=True, - as_two_stage=True, - data_preprocessor=dict( - type='DetDataPreprocessor', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - bgr_to_rgb=True, - pad_size_divisor=1), - backbone=dict( - type='ResNet', - depth=50, - num_stages=4, - out_indices=(1, 2, 3), - # AlignDETR: Only freeze stem. - frozen_stages=0, - norm_cfg=dict(type='FrozenBN', requires_grad=False), - norm_eval=True, - style='pytorch', - init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), - neck=dict( - type='ChannelMapper', - in_channels=[512, 1024, 2048], - kernel_size=1, - out_channels=256, - # AlignDETR: Add conv bias. - bias=True, - act_cfg=None, - norm_cfg=dict(type='GN', num_groups=32), - num_outs=4), - encoder=dict( - num_layers=6, - layer_cfg=dict( - self_attn_cfg=dict(embed_dims=256, num_levels=4, - dropout=0.0), # 0.1 for DeformDETR - ffn_cfg=dict( - embed_dims=256, - feedforward_channels=2048, # 1024 for DeformDETR - ffn_drop=0.0))), # 0.1 for DeformDETR - decoder=dict( - num_layers=6, - return_intermediate=True, - layer_cfg=dict( - self_attn_cfg=dict(embed_dims=256, num_heads=8, - dropout=0.0), # 0.1 for DeformDETR - cross_attn_cfg=dict(embed_dims=256, num_levels=4, - dropout=0.0), # 0.1 for DeformDETR - ffn_cfg=dict( - embed_dims=256, - feedforward_channels=2048, # 1024 for DeformDETR - ffn_drop=0.0)), # 0.1 for DeformDETR - post_norm_cfg=None), - positional_encoding=dict( - num_feats=128, - normalize=True, - # AlignDETR: Set offset and temperature the same as DeformDETR. - offset=-0.5, # -0.5 for DeformDETR - temperature=10000), # 10000 for DeformDETR - bbox_head=dict( - type='AlignDETRHead', - # AlignDETR: First 6 elements of `all_layers_num_gt_repeat` are for - # decoder layers' outputs. The last element is for encoder layer. 
- all_layers_num_gt_repeat=[2, 2, 2, 2, 2, 1, 2], - alpha=0.25, - gamma=2.0, - tau=1.5, - num_classes=80, - sync_cls_avg_factor=True, - loss_cls=dict( - type='CrossEntropyLoss', use_sigmoid=True, - loss_weight=1.0), # 2.0 in DeformDETR - loss_bbox=dict(type='L1Loss', loss_weight=5.0), - loss_iou=dict(type='GIoULoss', loss_weight=2.0)), - dn_cfg=dict( # TODO: Move to model.train_cfg ? - label_noise_scale=0.5, - box_noise_scale=1.0, # 0.4 for DN-DETR - group_cfg=dict(dynamic=True, num_groups=None, - num_dn_queries=100)), # TODO: half num_dn_queries - # training and testing settings - train_cfg=dict( - assigner=dict( - type='MixedHungarianAssigner', - match_costs=[ - dict(type='FocalLossCost', weight=2.0), - dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), - dict(type='IoUCost', iou_mode='giou', weight=2.0) - ])), - test_cfg=dict(max_per_img=300)) # 100 for DeformDETR - -# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different -# from the default setting in mmdet. -train_pipeline = [ - dict(type='LoadImageFromFile', backend_args=_base_.backend_args), - dict(type='LoadAnnotations', with_bbox=True), - dict(type='RandomFlip', prob=0.5), - dict( - type='RandomChoice', - transforms=[ - [ - dict( - type='RandomChoiceResize', - scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), - (608, 1333), (640, 1333), (672, 1333), (704, 1333), - (736, 1333), (768, 1333), (800, 1333)], - keep_ratio=True) - ], - [ - dict( - type='RandomChoiceResize', - # The radio of all image in train dataset < 7 - # follow the original implement - scales=[(400, 4200), (500, 4200), (600, 4200)], - keep_ratio=True), - dict( - type='RandomCrop', - crop_type='absolute_range', - crop_size=(384, 600), - allow_negative_crop=True), - dict( - type='RandomChoiceResize', - scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), - (608, 1333), (640, 1333), (672, 1333), (704, 1333), - (736, 1333), (768, 1333), (800, 1333)], - keep_ratio=True) - ] - ]), - dict(type='PackDetInputs') -] -train_dataloader = dict( - dataset=dict( - # AlignDETR: Filter empty gt. - filter_cfg=dict(filter_empty_gt=True), - pipeline=train_pipeline)) - -# optimizer -optim_wrapper = dict( - type='OptimWrapper', - optimizer=dict( - type='AdamW', - lr=0.0001, # 0.0002 for DeformDETR - weight_decay=0.0001), - clip_grad=dict(max_norm=0.1, norm_type=2), - paramwise_cfg=dict( - custom_keys={'backbone': dict(lr_mult=0.1)}, - # AlignDETR: No norm decay. - norm_decay_mult=0.0) -) # custom_keys contains sampling_offsets and reference_points in DeformDETR # noqa - -# learning policy -max_epochs = 12 -train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) - -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') - -param_scheduler = [ - dict( - type='LinearLR', - start_factor=0.0001, - by_epoch=False, - begin=0, - end=2000), - dict( - type='MultiStepLR', - begin=0, - end=max_epochs, - by_epoch=True, - milestones=[11], - gamma=0.1) -] - -# NOTE: `auto_scale_lr` is for automatically scaling LR, -# USER SHOULD NOT CHANGE ITS VALUES. 
-# base_batch_size = (8 GPUs) x (2 samples per GPU) -auto_scale_lr = dict(base_batch_size=16) diff --git a/projects/AlignDETR/configs/align_detr-4scale_r50_8xb2-24e_coco.py b/projects/AlignDETR/configs/align_detr-4scale_r50_8xb2-24e_coco.py deleted file mode 100644 index f62114ce0..000000000 --- a/projects/AlignDETR/configs/align_detr-4scale_r50_8xb2-24e_coco.py +++ /dev/null @@ -1,19 +0,0 @@ -_base_ = './align_detr-4scale_r50_8xb2-12e_coco.py' -max_epochs = 24 -train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) -param_scheduler = [ - dict( - type='LinearLR', - start_factor=0.0001, - by_epoch=False, - begin=0, - end=2000), - dict( - type='MultiStepLR', - begin=0, - end=max_epochs, - by_epoch=True, - milestones=[20], - gamma=0.1) -] diff --git a/projects/CO-DETR/README.md b/projects/CO-DETR/README.md deleted file mode 100644 index 787592ade..000000000 --- a/projects/CO-DETR/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# CO-DETR - -> [DETRs with Collaborative Hybrid Assignments Training](https://arxiv.org/abs/2211.12860) - - - -## Abstract - -In this paper, we make the observation that assigning too few queries as positive samples in DETR's one-to-one set matching leads to sparse supervision on the encoder's output, which considerably hurts the encoder's discriminative feature learning, and vice versa for attention learning in the decoder. To alleviate this, we present a novel collaborative hybrid assignments training scheme, namely Co-DETR, to learn more efficient and effective DETR-based detectors from versatile label assignment manners. This new training scheme can easily enhance the encoder's learning ability in end-to-end detectors by training multiple parallel auxiliary heads supervised by one-to-many label assignments such as ATSS and Faster R-CNN. In addition, we construct extra customized positive queries by extracting the positive coordinates from these auxiliary heads to improve the training efficiency of positive samples in the decoder. At inference, these auxiliary heads are discarded, so our method introduces no additional parameters or computational cost to the original detector while requiring no hand-crafted non-maximum suppression (NMS). We conduct extensive experiments to evaluate the effectiveness of the proposed approach on DETR variants, including DAB-DETR, Deformable-DETR, and DINO-Deformable-DETR. The state-of-the-art DINO-Deformable-DETR with Swin-L can be improved from 58.5% to 59.5% AP on COCO val. Surprisingly, incorporated with a ViT-L backbone, we achieve 66.0% AP on COCO test-dev and 67.9% AP on LVIS val, outperforming previous methods by clear margins with much smaller model sizes. - 
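The scheme described in the abstract is easy to picture as a set of training-time-only extra heads. Below is a toy sketch with made-up module names — it is not the real `CoDETR` implementation, which follows in `codetr/` further down: one shared encoder, a one-to-one matched DETR head that is kept at inference, and one-to-many auxiliary heads that only contribute losses during training.

```python
import torch
import torch.nn.functional as F
from torch import nn

class ToyCoDetector(nn.Module):
    """Toy collaborative-hybrid-assignment detector (illustrative only)."""

    def __init__(self, dim: int = 32, num_aux_heads: int = 2):
        super().__init__()
        self.encoder = nn.Linear(dim, dim)   # stand-in for the transformer encoder
        self.detr_head = nn.Linear(dim, 4)   # one-to-one matched head, kept at inference
        # stand-ins for one-to-many heads such as ATSS / Faster R-CNN
        self.aux_heads = nn.ModuleList(
            [nn.Linear(dim, 4) for _ in range(num_aux_heads)])

    def forward(self, feats, targets=None):
        memory = self.encoder(feats)
        pred = self.detr_head(memory)
        if not self.training:
            return pred                      # auxiliary heads are discarded at inference
        losses = {'loss_detr': F.l1_loss(pred, targets)}
        for i, head in enumerate(self.aux_heads):
            # one-to-many assignment densifies supervision on the shared encoder output
            losses[f'loss_aux{i}'] = F.l1_loss(head(memory), targets)
        return losses

model = ToyCoDetector()
feats, targets = torch.randn(8, 32), torch.randn(8, 4)
print(model(feats, targets))   # training: DETR loss plus auxiliary losses
model.eval()
print(model(feats).shape)      # inference: only the DETR head runs
```

The actual project wires this up through `CoDETR` with `CoATSSHead`, `CoStandardRoIHead`, and `CoDINOHead`, and additionally feeds the auxiliary heads' positive coordinates back to the decoder as customized positive queries.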
- -
- -## Results and Models - -| Model | Backbone | Epochs | Aug | Dataset | box AP | Config | Download | -| :-------: | :------: | :----: | :--: | :---------------------------: | :----: | :--------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| Co-DINO | R50 | 12 | LSJ | COCO | 52.0 | [config](configs/codino/co_dino_5scale_r50_lsj_8xb2_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/codetr/co_dino_5scale_r50_lsj_8xb2_1x_coco/co_dino_5scale_r50_lsj_8xb2_1x_coco-69a72d67.pth)\\ [log](https://download.openmmlab.com/mmdetection/v3.0/codetr/co_dino_5scale_r50_lsj_8xb2_1x_coco/co_dino_5scale_r50_lsj_8xb2_1x_coco_20230818_150457.json) | -| Co-DINO\* | R50 | 12 | DETR | COCO | 52.1 | [config](configs/codino/co_dino_5scale_r50_8xb2_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/codetr/co_dino_5scale_r50_1x_coco-7481f903.pth) | -| Co-DINO\* | R50 | 36 | LSJ | COCO | 54.8 | [config](configs/codino/co_dino_5scale_r50_lsj_8xb2_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/codetr/co_dino_5scale_lsj_r50_3x_coco-fe5a6829.pth) | -| Co-DINO\* | Swin-L | 12 | DETR | COCO | 58.9 | [config](configs/codino/co_dino_5scale_swin_l_16xb1_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/codetr/co_dino_5scale_swin_large_1x_coco-27c13da4.pth) | -| Co-DINO\* | Swin-L | 12 | LSJ | COCO | 59.3 | [config](configs/codino/co_dino_5scale_swin_l_lsj_16xb1_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/codetr/co_dino_5scale_lsj_swin_large_1x_coco-3af73af2.pth) | -| Co-DINO\* | Swin-L | 36 | DETR | COCO | 60.0 | [config](configs/codino/co_dino_5scale_swin_l_16xb1_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/codetr/co_dino_5scale_swin_large_3x_coco-d7a6d8af.pth) | -| Co-DINO\* | Swin-L | 36 | LSJ | COCO | 60.7 | [config](configs/codino/co_dino_5scale_swin_l_lsj_16xb1_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/codetr/co_dino_5scale_lsj_swin_large_1x_coco-3af73af2.pth) | -| Co-DINO\* | Swin-L | 16 | DETR | Objects365 pre-trained + COCO | 64.1 | [config](configs/codino/co_dino_5scale_swin_l_16xb1_16e_o365tococo.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/codetr/co_dino_5scale_swin_large_16e_o365tococo-614254c9.pth) | - -Note - -- Models labeled * are not trained by us, but from [CO-DETR](https://github.com/Sense-X/Co-DETR) official website. -- We find that the performance is unstable and may fluctuate by about 0.3 mAP. -- If you want to save GPU memory by enabling checkpointing, please use the `pip install fairscale` command. diff --git a/projects/CO-DETR/codetr/__init__.py b/projects/CO-DETR/codetr/__init__.py deleted file mode 100644 index 2ca4c02d9..000000000 --- a/projects/CO-DETR/codetr/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from .co_atss_head import CoATSSHead -from .co_dino_head import CoDINOHead -from .co_roi_head import CoStandardRoIHead -from .codetr import CoDETR -from .transformer import (CoDinoTransformer, DetrTransformerDecoderLayer, - DetrTransformerEncoder, DinoTransformerDecoder) - -__all__ = [ - 'CoDETR', 'CoDinoTransformer', 'DinoTransformerDecoder', 'CoDINOHead', - 'CoATSSHead', 'CoStandardRoIHead', 'DetrTransformerEncoder', - 'DetrTransformerDecoderLayer' -] diff --git a/projects/CO-DETR/codetr/co_atss_head.py b/projects/CO-DETR/codetr/co_atss_head.py deleted file mode 100644 index c6ae0180d..000000000 --- a/projects/CO-DETR/codetr/co_atss_head.py +++ /dev/null @@ -1,153 +0,0 @@ -from typing import List - -import torch -from torch import Tensor - -from mmdet.models.dense_heads import ATSSHead -from mmdet.models.utils import images_to_levels, multi_apply -from mmdet.registry import MODELS -from mmdet.utils import InstanceList, OptInstanceList, reduce_mean - - -@MODELS.register_module() -class CoATSSHead(ATSSHead): - - def loss_by_feat( - self, - cls_scores: List[Tensor], - bbox_preds: List[Tensor], - centernesses: List[Tensor], - batch_gt_instances: InstanceList, - batch_img_metas: List[dict], - batch_gt_instances_ignore: OptInstanceList = None) -> dict: - """Calculate the loss based on the features extracted by the detection - head. - - Args: - cls_scores (list[Tensor]): Box scores for each scale level - Has shape (N, num_anchors * num_classes, H, W) - bbox_preds (list[Tensor]): Box energies / deltas for each scale - level with shape (N, num_anchors * 4, H, W) - centernesses (list[Tensor]): Centerness for each scale - level with shape (N, num_anchors * 1, H, W) - batch_gt_instances (list[:obj:`InstanceData`]): Batch of - gt_instance. It usually includes ``bboxes`` and ``labels`` - attributes. - batch_img_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): - Batch of gt_instances_ignore. It includes ``bboxes`` attribute - data that is ignored during training and testing. - Defaults to None. - - Returns: - dict[str, Tensor]: A dictionary of loss components. 
- """ - featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] - assert len(featmap_sizes) == self.prior_generator.num_levels - - device = cls_scores[0].device - anchor_list, valid_flag_list = self.get_anchors( - featmap_sizes, batch_img_metas, device=device) - - cls_reg_targets = self.get_targets( - anchor_list, - valid_flag_list, - batch_gt_instances, - batch_img_metas, - batch_gt_instances_ignore=batch_gt_instances_ignore) - - (anchor_list, labels_list, label_weights_list, bbox_targets_list, - bbox_weights_list, avg_factor, ori_anchors, ori_labels, - ori_bbox_targets) = cls_reg_targets - - avg_factor = reduce_mean( - torch.tensor(avg_factor, dtype=torch.float, device=device)).item() - - losses_cls, losses_bbox, loss_centerness, \ - bbox_avg_factor = multi_apply( - self.loss_by_feat_single, - anchor_list, - cls_scores, - bbox_preds, - centernesses, - labels_list, - label_weights_list, - bbox_targets_list, - avg_factor=avg_factor) - - bbox_avg_factor = sum(bbox_avg_factor) - bbox_avg_factor = reduce_mean(bbox_avg_factor).clamp_(min=1).item() - losses_bbox = list(map(lambda x: x / bbox_avg_factor, losses_bbox)) - - # diff - pos_coords = (ori_anchors, ori_labels, ori_bbox_targets, 'atss') - return dict( - loss_cls=losses_cls, - loss_bbox=losses_bbox, - loss_centerness=loss_centerness, - pos_coords=pos_coords) - - def get_targets(self, - anchor_list: List[List[Tensor]], - valid_flag_list: List[List[Tensor]], - batch_gt_instances: InstanceList, - batch_img_metas: List[dict], - batch_gt_instances_ignore: OptInstanceList = None, - unmap_outputs: bool = True) -> tuple: - """Get targets for ATSS head. - - This method is almost the same as `AnchorHead.get_targets()`. Besides - returning the targets as the parent method does, it also returns the - anchors as the first element of the returned tuple. - """ - num_imgs = len(batch_img_metas) - assert len(anchor_list) == len(valid_flag_list) == num_imgs - - # anchor number of multi levels - num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] - num_level_anchors_list = [num_level_anchors] * num_imgs - - # concat all level anchors and flags to a single tensor - for i in range(num_imgs): - assert len(anchor_list[i]) == len(valid_flag_list[i]) - anchor_list[i] = torch.cat(anchor_list[i]) - valid_flag_list[i] = torch.cat(valid_flag_list[i]) - - # compute targets for each image - if batch_gt_instances_ignore is None: - batch_gt_instances_ignore = [None] * num_imgs - (all_anchors, all_labels, all_label_weights, all_bbox_targets, - all_bbox_weights, pos_inds_list, neg_inds_list, - sampling_results_list) = multi_apply( - self._get_targets_single, - anchor_list, - valid_flag_list, - num_level_anchors_list, - batch_gt_instances, - batch_img_metas, - batch_gt_instances_ignore, - unmap_outputs=unmap_outputs) - # Get `avg_factor` of all images, which calculate in `SamplingResult`. - # When using sampling method, avg_factor is usually the sum of - # positive and negative priors. When using `PseudoSampler`, - # `avg_factor` is usually equal to the number of positive priors. - avg_factor = sum( - [results.avg_factor for results in sampling_results_list]) - # split targets to a list w.r.t. 
multiple levels - anchors_list = images_to_levels(all_anchors, num_level_anchors) - labels_list = images_to_levels(all_labels, num_level_anchors) - label_weights_list = images_to_levels(all_label_weights, - num_level_anchors) - bbox_targets_list = images_to_levels(all_bbox_targets, - num_level_anchors) - bbox_weights_list = images_to_levels(all_bbox_weights, - num_level_anchors) - - # diff - ori_anchors = all_anchors - ori_labels = all_labels - ori_bbox_targets = all_bbox_targets - return (anchors_list, labels_list, label_weights_list, - bbox_targets_list, bbox_weights_list, avg_factor, ori_anchors, - ori_labels, ori_bbox_targets) diff --git a/projects/CO-DETR/codetr/co_dino_head.py b/projects/CO-DETR/codetr/co_dino_head.py deleted file mode 100644 index 192acf97d..000000000 --- a/projects/CO-DETR/codetr/co_dino_head.py +++ /dev/null @@ -1,677 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -from typing import List - -import torch -import torch.nn as nn -import torch.nn.functional as F -from mmcv.cnn import Linear -from mmcv.ops import batched_nms -from mmengine.structures import InstanceData -from torch import Tensor - -from mmdet.models import DINOHead -from mmdet.models.layers import CdnQueryGenerator -from mmdet.models.layers.transformer import inverse_sigmoid -from mmdet.models.utils import multi_apply -from mmdet.registry import MODELS -from mmdet.structures import SampleList -from mmdet.structures.bbox import (bbox_cxcywh_to_xyxy, bbox_overlaps, - bbox_xyxy_to_cxcywh) -from mmdet.utils import InstanceList, reduce_mean - - -@MODELS.register_module() -class CoDINOHead(DINOHead): - - def __init__(self, - *args, - num_query=900, - transformer=None, - in_channels=2048, - max_pos_coords=300, - dn_cfg=None, - use_zero_padding=False, - positional_encoding=dict( - type='SinePositionalEncoding', - num_feats=128, - normalize=True), - **kwargs): - self.with_box_refine = True - self.mixed_selection = True - self.in_channels = in_channels - self.max_pos_coords = max_pos_coords - self.positional_encoding = positional_encoding - self.num_query = num_query - self.use_zero_padding = use_zero_padding - - if 'two_stage_num_proposals' in transformer: - assert transformer['two_stage_num_proposals'] == num_query, \ - 'two_stage_num_proposals must be equal to num_query for DINO' - else: - transformer['two_stage_num_proposals'] = num_query - transformer['as_two_stage'] = True - if self.mixed_selection: - transformer['mixed_selection'] = self.mixed_selection - self.transformer = transformer - self.act_cfg = transformer.get('act_cfg', - dict(type='ReLU', inplace=True)) - - super().__init__(*args, **kwargs) - - self.activate = MODELS.build(self.act_cfg) - self.positional_encoding = MODELS.build(self.positional_encoding) - self.init_denoising(dn_cfg) - - def _init_layers(self): - self.transformer = MODELS.build(self.transformer) - self.embed_dims = self.transformer.embed_dims - assert hasattr(self.positional_encoding, 'num_feats') - num_feats = self.positional_encoding.num_feats - assert num_feats * 2 == self.embed_dims, 'embed_dims should' \ - f' be exactly 2 times of num_feats. Found {self.embed_dims}' \ - f' and {num_feats}.' 
- """Initialize classification branch and regression branch of head.""" - fc_cls = Linear(self.embed_dims, self.cls_out_channels) - reg_branch = [] - for _ in range(self.num_reg_fcs): - reg_branch.append(Linear(self.embed_dims, self.embed_dims)) - reg_branch.append(nn.ReLU()) - reg_branch.append(Linear(self.embed_dims, 4)) - reg_branch = nn.Sequential(*reg_branch) - - def _get_clones(module, N): - return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) - - # last reg_branch is used to generate proposal from - # encode feature map when as_two_stage is True. - num_pred = (self.transformer.decoder.num_layers + 1) if \ - self.as_two_stage else self.transformer.decoder.num_layers - - self.cls_branches = _get_clones(fc_cls, num_pred) - self.reg_branches = _get_clones(reg_branch, num_pred) - - self.downsample = nn.Sequential( - nn.Conv2d( - self.embed_dims, - self.embed_dims, - kernel_size=3, - stride=2, - padding=1), nn.GroupNorm(32, self.embed_dims)) - - def init_denoising(self, dn_cfg): - if dn_cfg is not None: - dn_cfg['num_classes'] = self.num_classes - dn_cfg['num_matching_queries'] = self.num_query - dn_cfg['embed_dims'] = self.embed_dims - self.dn_generator = CdnQueryGenerator(**dn_cfg) - - def forward(self, - mlvl_feats, - img_metas, - dn_label_query=None, - dn_bbox_query=None, - attn_mask=None): - batch_size = mlvl_feats[0].size(0) - input_img_h, input_img_w = img_metas[0]['batch_input_shape'] - img_masks = mlvl_feats[0].new_ones( - (batch_size, input_img_h, input_img_w)) - for img_id in range(batch_size): - img_h, img_w = img_metas[img_id]['img_shape'] - img_masks[img_id, :img_h, :img_w] = 0 - - mlvl_masks = [] - mlvl_positional_encodings = [] - for feat in mlvl_feats: - mlvl_masks.append( - F.interpolate(img_masks[None], - size=feat.shape[-2:]).to(torch.bool).squeeze(0)) - mlvl_positional_encodings.append( - self.positional_encoding(mlvl_masks[-1])) - - query_embeds = None - hs, inter_references, topk_score, topk_anchor, enc_outputs = \ - self.transformer( - mlvl_feats, - mlvl_masks, - query_embeds, - mlvl_positional_encodings, - dn_label_query, - dn_bbox_query, - attn_mask, - reg_branches=self.reg_branches if self.with_box_refine else None, # noqa:E501 - cls_branches=self.cls_branches if self.as_two_stage else None # noqa:E501 - ) - outs = [] - num_level = len(mlvl_feats) - start = 0 - for lvl in range(num_level): - bs, c, h, w = mlvl_feats[lvl].shape - end = start + h * w - feat = enc_outputs[start:end].permute(1, 2, 0).contiguous() - start = end - outs.append(feat.reshape(bs, c, h, w)) - outs.append(self.downsample(outs[-1])) - - hs = hs.permute(0, 2, 1, 3) - - if dn_label_query is not None and dn_label_query.size(1) == 0: - # NOTE: If there is no target in the image, the parameters of - # label_embedding won't be used in producing loss, which raises - # RuntimeError when using distributed mode. 
- hs[0] += self.dn_generator.label_embedding.weight[0, 0] * 0.0 - - outputs_classes = [] - outputs_coords = [] - - for lvl in range(hs.shape[0]): - reference = inter_references[lvl] - reference = inverse_sigmoid(reference, eps=1e-3) - outputs_class = self.cls_branches[lvl](hs[lvl]) - tmp = self.reg_branches[lvl](hs[lvl]) - if reference.shape[-1] == 4: - tmp += reference - else: - assert reference.shape[-1] == 2 - tmp[..., :2] += reference - outputs_coord = tmp.sigmoid() - outputs_classes.append(outputs_class) - outputs_coords.append(outputs_coord) - - outputs_classes = torch.stack(outputs_classes) - outputs_coords = torch.stack(outputs_coords) - - return outputs_classes, outputs_coords, topk_score, topk_anchor, outs - - def predict(self, - feats: List[Tensor], - batch_data_samples: SampleList, - rescale: bool = True) -> InstanceList: - batch_img_metas = [ - data_samples.metainfo for data_samples in batch_data_samples - ] - outs = self.forward(feats, batch_img_metas) - - predictions = self.predict_by_feat( - *outs, batch_img_metas=batch_img_metas, rescale=rescale) - - return predictions - - def predict_by_feat(self, - all_cls_scores, - all_bbox_preds, - enc_cls_scores, - enc_bbox_preds, - enc_outputs, - batch_img_metas, - rescale=True): - - cls_scores = all_cls_scores[-1] - bbox_preds = all_bbox_preds[-1] - - result_list = [] - for img_id in range(len(batch_img_metas)): - cls_score = cls_scores[img_id] - bbox_pred = bbox_preds[img_id] - img_meta = batch_img_metas[img_id] - results = self._predict_by_feat_single(cls_score, bbox_pred, - img_meta, rescale) - result_list.append(results) - return result_list - - def _predict_by_feat_single(self, - cls_score: Tensor, - bbox_pred: Tensor, - img_meta: dict, - rescale: bool = True) -> InstanceData: - """Transform outputs from the last decoder layer into bbox predictions - for each image. - - Args: - cls_score (Tensor): Box score logits from the last decoder layer - for each image. Shape [num_queries, cls_out_channels]. - bbox_pred (Tensor): Sigmoid outputs from the last decoder layer - for each image, with coordinate format (cx, cy, w, h) and - shape [num_queries, 4]. - img_meta (dict): Image meta info. - rescale (bool): If True, return boxes in original image - space. Default True. - - Returns: - :obj:`InstanceData`: Detection results of each image - after the post process. - Each item usually contains following keys. - - - scores (Tensor): Classification scores, has a shape - (num_instance, ) - - labels (Tensor): Labels of bboxes, has a shape - (num_instances, ). - - bboxes (Tensor): Has a shape (num_instances, 4), - the last dimension 4 arrange as (x1, y1, x2, y2). 
- """ - assert len(cls_score) == len(bbox_pred) # num_queries - max_per_img = self.test_cfg.get('max_per_img', self.num_query) - score_thr = self.test_cfg.get('score_thr', 0) - with_nms = self.test_cfg.get('nms', None) - - img_shape = img_meta['img_shape'] - # exclude background - if self.loss_cls.use_sigmoid: - cls_score = cls_score.sigmoid() - scores, indexes = cls_score.view(-1).topk(max_per_img) - det_labels = indexes % self.num_classes - bbox_index = indexes // self.num_classes - bbox_pred = bbox_pred[bbox_index] - else: - scores, det_labels = F.softmax(cls_score, dim=-1)[..., :-1].max(-1) - scores, bbox_index = scores.topk(max_per_img) - bbox_pred = bbox_pred[bbox_index] - det_labels = det_labels[bbox_index] - - if score_thr > 0: - valid_mask = scores > score_thr - scores = scores[valid_mask] - bbox_pred = bbox_pred[valid_mask] - det_labels = det_labels[valid_mask] - - det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred) - det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1] - det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0] - det_bboxes[:, 0::2].clamp_(min=0, max=img_shape[1]) - det_bboxes[:, 1::2].clamp_(min=0, max=img_shape[0]) - if rescale: - assert img_meta.get('scale_factor') is not None - det_bboxes /= det_bboxes.new_tensor( - img_meta['scale_factor']).repeat((1, 2)) - - results = InstanceData() - results.bboxes = det_bboxes - results.scores = scores - results.labels = det_labels - - if with_nms and results.bboxes.numel() > 0: - det_bboxes, keep_idxs = batched_nms(results.bboxes, results.scores, - results.labels, - self.test_cfg.nms) - results = results[keep_idxs] - results.scores = det_bboxes[:, -1] - results = results[:max_per_img] - - return results - - def loss(self, x, batch_data_samples): - assert self.dn_generator is not None, '"dn_cfg" must be set' - - batch_gt_instances = [] - batch_img_metas = [] - for data_sample in batch_data_samples: - batch_img_metas.append(data_sample.metainfo) - batch_gt_instances.append(data_sample.gt_instances) - - dn_label_query, dn_bbox_query, attn_mask, dn_meta = \ - self.dn_generator(batch_data_samples) - - outs = self(x, batch_img_metas, dn_label_query, dn_bbox_query, - attn_mask) - - loss_inputs = outs[:-1] + (batch_gt_instances, batch_img_metas, - dn_meta) - losses = self.loss_by_feat(*loss_inputs) - enc_outputs = outs[-1] - return losses, enc_outputs - - def forward_aux(self, mlvl_feats, img_metas, aux_targets, head_idx): - """Forward function. - - Args: - mlvl_feats (tuple[Tensor]): Features from the upstream - network, each is a 4D-tensor with shape - (N, C, H, W). - img_metas (list[dict]): List of image information. - - Returns: - all_cls_scores (Tensor): Outputs from the classification head, \ - shape [nb_dec, bs, num_query, cls_out_channels]. Note \ - cls_out_channels should includes background. - all_bbox_preds (Tensor): Sigmoid outputs from the regression \ - head with normalized coordinate format (cx, cy, w, h). \ - Shape [nb_dec, bs, num_query, 4]. - enc_outputs_class (Tensor): The score of each point on encode \ - feature map, has shape (N, h*w, num_class). Only when \ - as_two_stage is True it would be returned, otherwise \ - `None` would be returned. - enc_outputs_coord (Tensor): The proposal generate from the \ - encode feature map, has shape (N, h*w, 4). Only when \ - as_two_stage is True it would be returned, otherwise \ - `None` would be returned. 
- """ - aux_coords, aux_labels, aux_targets, aux_label_weights, \ - aux_bbox_weights, aux_feats, attn_masks = aux_targets - batch_size = mlvl_feats[0].size(0) - input_img_h, input_img_w = img_metas[0]['batch_input_shape'] - img_masks = mlvl_feats[0].new_ones( - (batch_size, input_img_h, input_img_w)) - for img_id in range(batch_size): - img_h, img_w = img_metas[img_id]['img_shape'] - img_masks[img_id, :img_h, :img_w] = 0 - - mlvl_masks = [] - mlvl_positional_encodings = [] - for feat in mlvl_feats: - mlvl_masks.append( - F.interpolate(img_masks[None], - size=feat.shape[-2:]).to(torch.bool).squeeze(0)) - mlvl_positional_encodings.append( - self.positional_encoding(mlvl_masks[-1])) - - query_embeds = None - hs, inter_references = self.transformer.forward_aux( - mlvl_feats, - mlvl_masks, - query_embeds, - mlvl_positional_encodings, - aux_coords, - pos_feats=aux_feats, - reg_branches=self.reg_branches if self.with_box_refine else None, - cls_branches=self.cls_branches if self.as_two_stage else None, - return_encoder_output=True, - attn_masks=attn_masks, - head_idx=head_idx) - - hs = hs.permute(0, 2, 1, 3) - outputs_classes = [] - outputs_coords = [] - - for lvl in range(hs.shape[0]): - reference = inter_references[lvl] - reference = inverse_sigmoid(reference, eps=1e-3) - outputs_class = self.cls_branches[lvl](hs[lvl]) - tmp = self.reg_branches[lvl](hs[lvl]) - if reference.shape[-1] == 4: - tmp += reference - else: - assert reference.shape[-1] == 2 - tmp[..., :2] += reference - outputs_coord = tmp.sigmoid() - outputs_classes.append(outputs_class) - outputs_coords.append(outputs_coord) - - outputs_classes = torch.stack(outputs_classes) - outputs_coords = torch.stack(outputs_coords) - - return outputs_classes, outputs_coords, None, None - - def loss_aux(self, - x, - pos_coords=None, - head_idx=0, - batch_data_samples=None): - batch_gt_instances = [] - batch_img_metas = [] - for data_sample in batch_data_samples: - batch_img_metas.append(data_sample.metainfo) - batch_gt_instances.append(data_sample.gt_instances) - - gt_bboxes = [b.bboxes for b in batch_gt_instances] - gt_labels = [b.labels for b in batch_gt_instances] - - aux_targets = self.get_aux_targets(pos_coords, batch_img_metas, x, - head_idx) - outs = self.forward_aux(x[:-1], batch_img_metas, aux_targets, head_idx) - outs = outs + aux_targets - if gt_labels is None: - loss_inputs = outs + (gt_bboxes, batch_img_metas) - else: - loss_inputs = outs + (gt_bboxes, gt_labels, batch_img_metas) - losses = self.loss_aux_by_feat(*loss_inputs) - return losses - - def get_aux_targets(self, pos_coords, img_metas, mlvl_feats, head_idx): - coords, labels, targets = pos_coords[:3] - head_name = pos_coords[-1] - bs, c = len(coords), mlvl_feats[0].shape[1] - max_num_coords = 0 - all_feats = [] - for i in range(bs): - label = labels[i] - feats = [ - feat[i].reshape(c, -1).transpose(1, 0) for feat in mlvl_feats - ] - feats = torch.cat(feats, dim=0) - bg_class_ind = self.num_classes - pos_inds = ((label >= 0) - & (label < bg_class_ind)).nonzero().squeeze(1) - max_num_coords = max(max_num_coords, len(pos_inds)) - all_feats.append(feats) - max_num_coords = min(self.max_pos_coords, max_num_coords) - max_num_coords = max(9, max_num_coords) - - if self.use_zero_padding: - attn_masks = [] - label_weights = coords[0].new_zeros([bs, max_num_coords]) - else: - attn_masks = None - label_weights = coords[0].new_ones([bs, max_num_coords]) - bbox_weights = coords[0].new_zeros([bs, max_num_coords, 4]) - - aux_coords, aux_labels, aux_targets, aux_feats = [], [], [], [] - - 
for i in range(bs): - coord, label, target = coords[i], labels[i], targets[i] - feats = all_feats[i] - if 'rcnn' in head_name: - feats = pos_coords[-2][i] - num_coords_per_point = 1 - else: - num_coords_per_point = coord.shape[0] // feats.shape[0] - feats = feats.unsqueeze(1).repeat(1, num_coords_per_point, 1) - feats = feats.reshape(feats.shape[0] * num_coords_per_point, - feats.shape[-1]) - img_meta = img_metas[i] - img_h, img_w = img_meta['img_shape'] - factor = coord.new_tensor([img_w, img_h, img_w, - img_h]).unsqueeze(0) - bg_class_ind = self.num_classes - pos_inds = ((label >= 0) - & (label < bg_class_ind)).nonzero().squeeze(1) - neg_inds = (label == bg_class_ind).nonzero().squeeze(1) - if pos_inds.shape[0] > max_num_coords: - indices = torch.randperm( - pos_inds.shape[0])[:max_num_coords].cuda() - pos_inds = pos_inds[indices] - - coord = bbox_xyxy_to_cxcywh(coord[pos_inds] / factor) - label = label[pos_inds] - target = bbox_xyxy_to_cxcywh(target[pos_inds] / factor) - feat = feats[pos_inds] - - if self.use_zero_padding: - label_weights[i][:len(label)] = 1 - bbox_weights[i][:len(label)] = 1 - attn_mask = torch.zeros([ - max_num_coords, - max_num_coords, - ]).bool().to(coord.device) - else: - bbox_weights[i][:len(label)] = 1 - - if coord.shape[0] < max_num_coords: - padding_shape = max_num_coords - coord.shape[0] - if self.use_zero_padding: - padding_coord = coord.new_zeros([padding_shape, 4]) - padding_label = label.new_ones([padding_shape - ]) * self.num_classes - padding_target = target.new_zeros([padding_shape, 4]) - padding_feat = feat.new_zeros([padding_shape, c]) - attn_mask[coord.shape[0]:, 0:coord.shape[0], ] = True - attn_mask[:, coord.shape[0]:, ] = True - else: - indices = torch.randperm( - neg_inds.shape[0])[:padding_shape].cuda() - neg_inds = neg_inds[indices] - padding_coord = bbox_xyxy_to_cxcywh(coords[i][neg_inds] / - factor) - padding_label = labels[i][neg_inds] - padding_target = bbox_xyxy_to_cxcywh(targets[i][neg_inds] / - factor) - padding_feat = feats[neg_inds] - coord = torch.cat((coord, padding_coord), dim=0) - label = torch.cat((label, padding_label), dim=0) - target = torch.cat((target, padding_target), dim=0) - feat = torch.cat((feat, padding_feat), dim=0) - if self.use_zero_padding: - attn_masks.append(attn_mask.unsqueeze(0)) - aux_coords.append(coord.unsqueeze(0)) - aux_labels.append(label.unsqueeze(0)) - aux_targets.append(target.unsqueeze(0)) - aux_feats.append(feat.unsqueeze(0)) - - if self.use_zero_padding: - attn_masks = torch.cat( - attn_masks, dim=0).unsqueeze(1).repeat(1, 8, 1, 1) - attn_masks = attn_masks.reshape(bs * 8, max_num_coords, - max_num_coords) - else: - attn_masks = None - - aux_coords = torch.cat(aux_coords, dim=0) - aux_labels = torch.cat(aux_labels, dim=0) - aux_targets = torch.cat(aux_targets, dim=0) - aux_feats = torch.cat(aux_feats, dim=0) - aux_label_weights = label_weights - aux_bbox_weights = bbox_weights - return (aux_coords, aux_labels, aux_targets, aux_label_weights, - aux_bbox_weights, aux_feats, attn_masks) - - def loss_aux_by_feat(self, - all_cls_scores, - all_bbox_preds, - enc_cls_scores, - enc_bbox_preds, - aux_coords, - aux_labels, - aux_targets, - aux_label_weights, - aux_bbox_weights, - aux_feats, - attn_masks, - gt_bboxes_list, - gt_labels_list, - img_metas, - gt_bboxes_ignore=None): - num_dec_layers = len(all_cls_scores) - all_labels = [aux_labels for _ in range(num_dec_layers)] - all_label_weights = [aux_label_weights for _ in range(num_dec_layers)] - all_bbox_targets = [aux_targets for _ in range(num_dec_layers)] 
- all_bbox_weights = [aux_bbox_weights for _ in range(num_dec_layers)] - img_metas_list = [img_metas for _ in range(num_dec_layers)] - all_gt_bboxes_ignore_list = [ - gt_bboxes_ignore for _ in range(num_dec_layers) - ] - - losses_cls, losses_bbox, losses_iou = multi_apply( - self._loss_aux_by_feat_single, all_cls_scores, all_bbox_preds, - all_labels, all_label_weights, all_bbox_targets, all_bbox_weights, - img_metas_list, all_gt_bboxes_ignore_list) - - loss_dict = dict() - # loss of proposal generated from encode feature map. - - # loss from the last decoder layer - loss_dict['loss_cls_aux'] = losses_cls[-1] - loss_dict['loss_bbox_aux'] = losses_bbox[-1] - loss_dict['loss_iou_aux'] = losses_iou[-1] - # loss from other decoder layers - num_dec_layer = 0 - for loss_cls_i, loss_bbox_i, loss_iou_i in zip(losses_cls[:-1], - losses_bbox[:-1], - losses_iou[:-1]): - loss_dict[f'd{num_dec_layer}.loss_cls_aux'] = loss_cls_i - loss_dict[f'd{num_dec_layer}.loss_bbox_aux'] = loss_bbox_i - loss_dict[f'd{num_dec_layer}.loss_iou_aux'] = loss_iou_i - num_dec_layer += 1 - return loss_dict - - def _loss_aux_by_feat_single(self, - cls_scores, - bbox_preds, - labels, - label_weights, - bbox_targets, - bbox_weights, - img_metas, - gt_bboxes_ignore_list=None): - num_imgs = cls_scores.size(0) - num_q = cls_scores.size(1) - - try: - labels = labels.reshape(num_imgs * num_q) - label_weights = label_weights.reshape(num_imgs * num_q) - bbox_targets = bbox_targets.reshape(num_imgs * num_q, 4) - bbox_weights = bbox_weights.reshape(num_imgs * num_q, 4) - except Exception: - return cls_scores.mean() * 0, cls_scores.mean( - ) * 0, cls_scores.mean() * 0 - - bg_class_ind = self.num_classes - num_total_pos = len( - ((labels >= 0) & (labels < bg_class_ind)).nonzero().squeeze(1)) - num_total_neg = num_imgs * num_q - num_total_pos - - # classification loss - cls_scores = cls_scores.reshape(-1, self.cls_out_channels) - # construct weighted avg_factor to match with the official DETR repo - cls_avg_factor = num_total_pos * 1.0 + \ - num_total_neg * self.bg_cls_weight - if self.sync_cls_avg_factor: - cls_avg_factor = reduce_mean( - cls_scores.new_tensor([cls_avg_factor])) - cls_avg_factor = max(cls_avg_factor, 1) - - bg_class_ind = self.num_classes - pos_inds = ((labels >= 0) - & (labels < bg_class_ind)).nonzero().squeeze(1) - scores = label_weights.new_zeros(labels.shape) - pos_bbox_targets = bbox_targets[pos_inds] - pos_decode_bbox_targets = bbox_cxcywh_to_xyxy(pos_bbox_targets) - pos_bbox_pred = bbox_preds.reshape(-1, 4)[pos_inds] - pos_decode_bbox_pred = bbox_cxcywh_to_xyxy(pos_bbox_pred) - scores[pos_inds] = bbox_overlaps( - pos_decode_bbox_pred.detach(), - pos_decode_bbox_targets, - is_aligned=True) - loss_cls = self.loss_cls( - cls_scores, (labels, scores), - weight=label_weights, - avg_factor=cls_avg_factor) - - # Compute the average number of gt boxes across all gpus, for - # normalization purposes - num_total_pos = loss_cls.new_tensor([num_total_pos]) - num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() - - # construct factors used for rescale bboxes - factors = [] - for img_meta, bbox_pred in zip(img_metas, bbox_preds): - img_h, img_w = img_meta['img_shape'] - factor = bbox_pred.new_tensor([img_w, img_h, img_w, - img_h]).unsqueeze(0).repeat( - bbox_pred.size(0), 1) - factors.append(factor) - factors = torch.cat(factors, 0) - - # DETR regress the relative position of boxes (cxcywh) in the image, - # thus the learning target is normalized by the image size. 
So here - # we need to re-scale them for calculating IoU loss - bbox_preds = bbox_preds.reshape(-1, 4) - bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors - bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors - - # regression IoU loss, defaultly GIoU loss - loss_iou = self.loss_iou( - bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos) - - # regression L1 loss - loss_bbox = self.loss_bbox( - bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos) - return loss_cls, loss_bbox, loss_iou diff --git a/projects/CO-DETR/codetr/co_roi_head.py b/projects/CO-DETR/codetr/co_roi_head.py deleted file mode 100644 index 9aafb53be..000000000 --- a/projects/CO-DETR/codetr/co_roi_head.py +++ /dev/null @@ -1,108 +0,0 @@ -from typing import List, Tuple - -import torch -from torch import Tensor - -from mmdet.models.roi_heads import StandardRoIHead -from mmdet.models.task_modules.samplers import SamplingResult -from mmdet.models.utils import unpack_gt_instances -from mmdet.registry import MODELS -from mmdet.structures import DetDataSample -from mmdet.structures.bbox import bbox2roi -from mmdet.utils import InstanceList - - -@MODELS.register_module() -class CoStandardRoIHead(StandardRoIHead): - - def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList, - batch_data_samples: List[DetDataSample]) -> dict: - max_proposal = 2000 - - assert len(rpn_results_list) == len(batch_data_samples) - outputs = unpack_gt_instances(batch_data_samples) - batch_gt_instances, batch_gt_instances_ignore, _ = outputs - - # assign gts and sample proposals - num_imgs = len(batch_data_samples) - sampling_results = [] - for i in range(num_imgs): - # rename rpn_results.bboxes to rpn_results.priors - rpn_results = rpn_results_list[i] - rpn_results.priors = rpn_results.pop('bboxes') - - assign_result = self.bbox_assigner.assign( - rpn_results, batch_gt_instances[i], - batch_gt_instances_ignore[i]) - sampling_result = self.bbox_sampler.sample( - assign_result, - rpn_results, - batch_gt_instances[i], - feats=[lvl_feat[i][None] for lvl_feat in x]) - sampling_results.append(sampling_result) - - losses = dict() - # bbox head forward and loss - if self.with_bbox: - bbox_results = self.bbox_loss(x, sampling_results) - losses.update(bbox_results['loss_bbox']) - - bbox_targets = bbox_results['bbox_targets'] - for res in sampling_results: - max_proposal = min(max_proposal, res.bboxes.shape[0]) - ori_coords = bbox2roi([res.bboxes for res in sampling_results]) - ori_proposals, ori_labels, \ - ori_bbox_targets, ori_bbox_feats = [], [], [], [] - for i in range(num_imgs): - idx = (ori_coords[:, 0] == i).nonzero().squeeze(1) - idx = idx[:max_proposal] - ori_proposal = ori_coords[idx][:, 1:].unsqueeze(0) - ori_label = bbox_targets[0][idx].unsqueeze(0) - ori_bbox_target = bbox_targets[2][idx].unsqueeze(0) - ori_bbox_feat = bbox_results['bbox_feats'].mean(-1).mean(-1) - ori_bbox_feat = ori_bbox_feat[idx].unsqueeze(0) - ori_proposals.append(ori_proposal) - ori_labels.append(ori_label) - ori_bbox_targets.append(ori_bbox_target) - ori_bbox_feats.append(ori_bbox_feat) - ori_coords = torch.cat(ori_proposals, dim=0) - ori_labels = torch.cat(ori_labels, dim=0) - ori_bbox_targets = torch.cat(ori_bbox_targets, dim=0) - ori_bbox_feats = torch.cat(ori_bbox_feats, dim=0) - pos_coords = (ori_coords, ori_labels, ori_bbox_targets, - ori_bbox_feats, 'rcnn') - losses.update(pos_coords=pos_coords) - - return losses - - def bbox_loss(self, x: Tuple[Tensor], - sampling_results: List[SamplingResult]) -> dict: - """Perform forward propagation and loss 
calculation of the bbox head on - the features of the upstream network. - - Args: - x (tuple[Tensor]): List of multi-level img features. - sampling_results (list["obj:`SamplingResult`]): Sampling results. - - Returns: - dict[str, Tensor]: Usually returns a dictionary with keys: - - - `cls_score` (Tensor): Classification scores. - - `bbox_pred` (Tensor): Box energies / deltas. - - `bbox_feats` (Tensor): Extract bbox RoI features. - - `loss_bbox` (dict): A dictionary of bbox loss components. - """ - rois = bbox2roi([res.priors for res in sampling_results]) - bbox_results = self._bbox_forward(x, rois) - - bbox_loss_and_target = self.bbox_head.loss_and_target( - cls_score=bbox_results['cls_score'], - bbox_pred=bbox_results['bbox_pred'], - rois=rois, - sampling_results=sampling_results, - rcnn_train_cfg=self.train_cfg) - - bbox_results.update(loss_bbox=bbox_loss_and_target['loss_bbox']) - # diff - bbox_results.update(bbox_targets=bbox_loss_and_target['bbox_targets']) - return bbox_results diff --git a/projects/CO-DETR/codetr/codetr.py b/projects/CO-DETR/codetr/codetr.py deleted file mode 100644 index 82826f641..000000000 --- a/projects/CO-DETR/codetr/codetr.py +++ /dev/null @@ -1,320 +0,0 @@ -import copy -from typing import Tuple, Union - -import torch -import torch.nn as nn -from torch import Tensor - -from mmdet.models.detectors.base import BaseDetector -from mmdet.registry import MODELS -from mmdet.structures import OptSampleList, SampleList -from mmdet.utils import InstanceList, OptConfigType, OptMultiConfig - - -@MODELS.register_module() -class CoDETR(BaseDetector): - - def __init__( - self, - backbone, - neck=None, - query_head=None, # detr head - rpn_head=None, # two-stage rpn - roi_head=[None], # two-stage - bbox_head=[None], # one-stage - train_cfg=[None, None], - test_cfg=[None, None], - # Control whether to consider positive samples - # from the auxiliary head as additional positive queries. - with_pos_coord=True, - use_lsj=True, - eval_module='detr', - # Evaluate the Nth head. 
- eval_index=0, - data_preprocessor: OptConfigType = None, - init_cfg: OptMultiConfig = None): - super(CoDETR, self).__init__( - data_preprocessor=data_preprocessor, init_cfg=init_cfg) - self.with_pos_coord = with_pos_coord - self.use_lsj = use_lsj - - assert eval_module in ['detr', 'one-stage', 'two-stage'] - self.eval_module = eval_module - - self.backbone = MODELS.build(backbone) - if neck is not None: - self.neck = MODELS.build(neck) - # Module index for evaluation - self.eval_index = eval_index - head_idx = 0 - if query_head is not None: - query_head.update(train_cfg=train_cfg[head_idx] if ( - train_cfg is not None and train_cfg[head_idx] is not None - ) else None) - query_head.update(test_cfg=test_cfg[head_idx]) - self.query_head = MODELS.build(query_head) - self.query_head.init_weights() - head_idx += 1 - - if rpn_head is not None: - rpn_train_cfg = train_cfg[head_idx].rpn if ( - train_cfg is not None - and train_cfg[head_idx] is not None) else None - rpn_head_ = rpn_head.copy() - rpn_head_.update( - train_cfg=rpn_train_cfg, test_cfg=test_cfg[head_idx].rpn) - self.rpn_head = MODELS.build(rpn_head_) - self.rpn_head.init_weights() - - self.roi_head = nn.ModuleList() - for i in range(len(roi_head)): - if roi_head[i]: - rcnn_train_cfg = train_cfg[i + head_idx].rcnn if ( - train_cfg - and train_cfg[i + head_idx] is not None) else None - roi_head[i].update(train_cfg=rcnn_train_cfg) - roi_head[i].update(test_cfg=test_cfg[i + head_idx].rcnn) - self.roi_head.append(MODELS.build(roi_head[i])) - self.roi_head[-1].init_weights() - - self.bbox_head = nn.ModuleList() - for i in range(len(bbox_head)): - if bbox_head[i]: - bbox_head[i].update( - train_cfg=train_cfg[i + head_idx + len(self.roi_head)] if ( - train_cfg and train_cfg[i + head_idx + - len(self.roi_head)] is not None - ) else None) - bbox_head[i].update(test_cfg=test_cfg[i + head_idx + - len(self.roi_head)]) - self.bbox_head.append(MODELS.build(bbox_head[i])) - self.bbox_head[-1].init_weights() - - self.head_idx = head_idx - self.train_cfg = train_cfg - self.test_cfg = test_cfg - - @property - def with_rpn(self): - """bool: whether the detector has RPN""" - return hasattr(self, 'rpn_head') and self.rpn_head is not None - - @property - def with_query_head(self): - """bool: whether the detector has a RoI head""" - return hasattr(self, 'query_head') and self.query_head is not None - - @property - def with_roi_head(self): - """bool: whether the detector has a RoI head""" - return hasattr(self, 'roi_head') and self.roi_head is not None and len( - self.roi_head) > 0 - - @property - def with_shared_head(self): - """bool: whether the detector has a shared head in the RoI Head""" - return hasattr(self, 'roi_head') and self.roi_head[0].with_shared_head - - @property - def with_bbox(self): - """bool: whether the detector has a bbox head""" - return ((hasattr(self, 'roi_head') and self.roi_head is not None - and len(self.roi_head) > 0) - or (hasattr(self, 'bbox_head') and self.bbox_head is not None - and len(self.bbox_head) > 0)) - - def extract_feat(self, batch_inputs: Tensor) -> Tuple[Tensor]: - """Extract features. - - Args: - batch_inputs (Tensor): Image tensor, has shape (bs, dim, H, W). - - Returns: - tuple[Tensor]: Tuple of feature maps from neck. Each feature map - has shape (bs, dim, H, W). 
- """ - x = self.backbone(batch_inputs) - if self.with_neck: - x = self.neck(x) - return x - - def _forward(self, - batch_inputs: Tensor, - batch_data_samples: OptSampleList = None): - pass - - def loss(self, batch_inputs: Tensor, - batch_data_samples: SampleList) -> Union[dict, list]: - batch_input_shape = batch_data_samples[0].batch_input_shape - if self.use_lsj: - for data_samples in batch_data_samples: - img_metas = data_samples.metainfo - input_img_h, input_img_w = batch_input_shape - img_metas['img_shape'] = [input_img_h, input_img_w] - - x = self.extract_feat(batch_inputs) - - losses = dict() - - def upd_loss(losses, idx, weight=1): - new_losses = dict() - for k, v in losses.items(): - new_k = '{}{}'.format(k, idx) - if isinstance(v, list) or isinstance(v, tuple): - new_losses[new_k] = [i * weight for i in v] - else: - new_losses[new_k] = v * weight - return new_losses - - # DETR encoder and decoder forward - if self.with_query_head: - bbox_losses, x = self.query_head.loss(x, batch_data_samples) - losses.update(bbox_losses) - - # RPN forward and loss - if self.with_rpn: - proposal_cfg = self.train_cfg[self.head_idx].get( - 'rpn_proposal', self.test_cfg[self.head_idx].rpn) - - rpn_data_samples = copy.deepcopy(batch_data_samples) - # set cat_id of gt_labels to 0 in RPN - for data_sample in rpn_data_samples: - data_sample.gt_instances.labels = \ - torch.zeros_like(data_sample.gt_instances.labels) - - rpn_losses, proposal_list = self.rpn_head.loss_and_predict( - x, rpn_data_samples, proposal_cfg=proposal_cfg) - - # avoid get same name with roi_head loss - keys = rpn_losses.keys() - for key in list(keys): - if 'loss' in key and 'rpn' not in key: - rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key) - - losses.update(rpn_losses) - else: - assert batch_data_samples[0].get('proposals', None) is not None - # use pre-defined proposals in InstanceData for the second stage - # to extract ROI features. - proposal_list = [ - data_sample.proposals for data_sample in batch_data_samples - ] - - positive_coords = [] - for i in range(len(self.roi_head)): - roi_losses = self.roi_head[i].loss(x, proposal_list, - batch_data_samples) - if self.with_pos_coord: - positive_coords.append(roi_losses.pop('pos_coords')) - else: - if 'pos_coords' in roi_losses.keys(): - roi_losses.pop('pos_coords') - roi_losses = upd_loss(roi_losses, idx=i) - losses.update(roi_losses) - - for i in range(len(self.bbox_head)): - bbox_losses = self.bbox_head[i].loss(x, batch_data_samples) - if self.with_pos_coord: - pos_coords = bbox_losses.pop('pos_coords') - positive_coords.append(pos_coords) - else: - if 'pos_coords' in bbox_losses.keys(): - bbox_losses.pop('pos_coords') - bbox_losses = upd_loss(bbox_losses, idx=i + len(self.roi_head)) - losses.update(bbox_losses) - - if self.with_pos_coord and len(positive_coords) > 0: - for i in range(len(positive_coords)): - bbox_losses = self.query_head.loss_aux(x, positive_coords[i], - i, batch_data_samples) - bbox_losses = upd_loss(bbox_losses, idx=i) - losses.update(bbox_losses) - - return losses - - def predict(self, - batch_inputs: Tensor, - batch_data_samples: SampleList, - rescale: bool = True) -> SampleList: - """Predict results from a batch of inputs and data samples with post- - processing. - - Args: - batch_inputs (Tensor): Inputs, has shape (bs, dim, H, W). - batch_data_samples (List[:obj:`DetDataSample`]): The batch - data samples. It usually includes information such - as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. - rescale (bool): Whether to rescale the results. 
- Defaults to True. - - Returns: - list[:obj:`DetDataSample`]: Detection results of the input images. - Each DetDataSample usually contain 'pred_instances'. And the - `pred_instances` usually contains following keys. - - - scores (Tensor): Classification scores, has a shape - (num_instance, ) - - labels (Tensor): Labels of bboxes, has a shape - (num_instances, ). - - bboxes (Tensor): Has a shape (num_instances, 4), - the last dimension 4 arrange as (x1, y1, x2, y2). - """ - assert self.eval_module in ['detr', 'one-stage', 'two-stage'] - - if self.use_lsj: - for data_samples in batch_data_samples: - img_metas = data_samples.metainfo - input_img_h, input_img_w = img_metas['batch_input_shape'] - img_metas['img_shape'] = [input_img_h, input_img_w] - - img_feats = self.extract_feat(batch_inputs) - if self.with_bbox and self.eval_module == 'one-stage': - results_list = self.predict_bbox_head( - img_feats, batch_data_samples, rescale=rescale) - elif self.with_roi_head and self.eval_module == 'two-stage': - results_list = self.predict_roi_head( - img_feats, batch_data_samples, rescale=rescale) - else: - results_list = self.predict_query_head( - img_feats, batch_data_samples, rescale=rescale) - - batch_data_samples = self.add_pred_to_datasample( - batch_data_samples, results_list) - return batch_data_samples - - def predict_query_head(self, - mlvl_feats: Tuple[Tensor], - batch_data_samples: SampleList, - rescale: bool = True) -> InstanceList: - return self.query_head.predict( - mlvl_feats, batch_data_samples=batch_data_samples, rescale=rescale) - - def predict_roi_head(self, - mlvl_feats: Tuple[Tensor], - batch_data_samples: SampleList, - rescale: bool = True) -> InstanceList: - assert self.with_bbox, 'Bbox head must be implemented.' - if self.with_query_head: - batch_img_metas = [ - data_samples.metainfo for data_samples in batch_data_samples - ] - results = self.query_head.forward(mlvl_feats, batch_img_metas) - mlvl_feats = results[-1] - rpn_results_list = self.rpn_head.predict( - mlvl_feats, batch_data_samples, rescale=False) - return self.roi_head[self.eval_index].predict( - mlvl_feats, rpn_results_list, batch_data_samples, rescale=rescale) - - def predict_bbox_head(self, - mlvl_feats: Tuple[Tensor], - batch_data_samples: SampleList, - rescale: bool = True) -> InstanceList: - assert self.with_bbox, 'Bbox head must be implemented.' 
- if self.with_query_head: - batch_img_metas = [ - data_samples.metainfo for data_samples in batch_data_samples - ] - results = self.query_head.forward(mlvl_feats, batch_img_metas) - mlvl_feats = results[-1] - return self.bbox_head[self.eval_index].predict( - mlvl_feats, batch_data_samples, rescale=rescale) diff --git a/projects/CO-DETR/codetr/transformer.py b/projects/CO-DETR/codetr/transformer.py deleted file mode 100644 index 009f94a8b..000000000 --- a/projects/CO-DETR/codetr/transformer.py +++ /dev/null @@ -1,1376 +0,0 @@ -import math -import warnings - -import torch -import torch.nn as nn -from mmcv.cnn import build_norm_layer -from mmcv.cnn.bricks.transformer import (BaseTransformerLayer, - TransformerLayerSequence, - build_transformer_layer_sequence) -from mmcv.ops import MultiScaleDeformableAttention -from mmengine.model import BaseModule -from mmengine.model.weight_init import xavier_init -from torch.nn.init import normal_ - -from mmdet.models.layers.transformer import inverse_sigmoid -from mmdet.registry import MODELS - -try: - from fairscale.nn.checkpoint import checkpoint_wrapper -except Exception: - checkpoint_wrapper = None - -# In order to save the cost and effort of reproduction, -# I did not refactor it into the style of mmdet 3.x DETR. - - -class Transformer(BaseModule): - """Implements the DETR transformer. - - Following the official DETR implementation, this module copy-paste - from torch.nn.Transformer with modifications: - - * positional encodings are passed in MultiheadAttention - * extra LN at the end of encoder is removed - * decoder returns a stack of activations from all decoding layers - - See `paper: End-to-End Object Detection with Transformers - `_ for details. - - Args: - encoder (`mmcv.ConfigDict` | Dict): Config of - TransformerEncoder. Defaults to None. - decoder ((`mmcv.ConfigDict` | Dict)): Config of - TransformerDecoder. Defaults to None - init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. - Defaults to None. - """ - - def __init__(self, encoder=None, decoder=None, init_cfg=None): - super(Transformer, self).__init__(init_cfg=init_cfg) - self.encoder = build_transformer_layer_sequence(encoder) - self.decoder = build_transformer_layer_sequence(decoder) - self.embed_dims = self.encoder.embed_dims - - def init_weights(self): - # follow the official DETR to init parameters - for m in self.modules(): - if hasattr(m, 'weight') and m.weight.dim() > 1: - xavier_init(m, distribution='uniform') - self._is_init = True - - def forward(self, x, mask, query_embed, pos_embed): - """Forward function for `Transformer`. - - Args: - x (Tensor): Input query with shape [bs, c, h, w] where - c = embed_dims. - mask (Tensor): The key_padding_mask used for encoder and decoder, - with shape [bs, h, w]. - query_embed (Tensor): The query embedding for decoder, with shape - [num_query, c]. - pos_embed (Tensor): The positional encoding for encoder and - decoder, with the same shape as `x`. - - Returns: - tuple[Tensor]: results of decoder containing the following tensor. - - - out_dec: Output from decoder. If return_intermediate_dec \ - is True output has shape [num_dec_layers, bs, - num_query, embed_dims], else has shape [1, bs, \ - num_query, embed_dims]. - - memory: Output results from encoder, with shape \ - [bs, embed_dims, h, w]. 
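The shape bookkeeping in this forward pass is easy to get wrong; a short sketch (invented dimensions) of the flatten/permute round trip between the feature-map layout and the sequence-first layout the encoder expects:

```python
import torch

bs, c, h, w = 2, 256, 16, 24
x = torch.randn(bs, c, h, w)

# [bs, c, h, w] -> [h*w, bs, c] for the encoder (sequence-first layout).
seq = x.view(bs, c, -1).permute(2, 0, 1)
print(seq.shape)  # torch.Size([384, 2, 256])

# ... encoder would run here ...

# [h*w, bs, c] -> [bs, c, h, w] to hand the memory back as a feature map.
memory = seq.permute(1, 2, 0).reshape(bs, c, h, w)
print(torch.equal(memory, x))  # True, since no encoder was applied in this sketch
```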
- """ - bs, c, h, w = x.shape - # use `view` instead of `flatten` for dynamically exporting to ONNX - x = x.view(bs, c, -1).permute(2, 0, 1) # [bs, c, h, w] -> [h*w, bs, c] - pos_embed = pos_embed.view(bs, c, -1).permute(2, 0, 1) - query_embed = query_embed.unsqueeze(1).repeat( - 1, bs, 1) # [num_query, dim] -> [num_query, bs, dim] - mask = mask.view(bs, -1) # [bs, h, w] -> [bs, h*w] - memory = self.encoder( - query=x, - key=None, - value=None, - query_pos=pos_embed, - query_key_padding_mask=mask) - target = torch.zeros_like(query_embed) - # out_dec: [num_layers, num_query, bs, dim] - out_dec = self.decoder( - query=target, - key=memory, - value=memory, - key_pos=pos_embed, - query_pos=query_embed, - key_padding_mask=mask) - out_dec = out_dec.transpose(1, 2) - memory = memory.permute(1, 2, 0).reshape(bs, c, h, w) - return out_dec, memory - - -@MODELS.register_module(force=True) -class DeformableDetrTransformerDecoder(TransformerLayerSequence): - """Implements the decoder in DETR transformer. - - Args: - return_intermediate (bool): Whether to return intermediate outputs. - coder_norm_cfg (dict): Config of last normalization layer. Default: - `LN`. - """ - - def __init__(self, *args, return_intermediate=False, **kwargs): - - super(DeformableDetrTransformerDecoder, self).__init__(*args, **kwargs) - self.return_intermediate = return_intermediate - - def forward(self, - query, - *args, - reference_points=None, - valid_ratios=None, - reg_branches=None, - **kwargs): - """Forward function for `TransformerDecoder`. - - Args: - query (Tensor): Input query with shape - `(num_query, bs, embed_dims)`. - reference_points (Tensor): The reference - points of offset. has shape - (bs, num_query, 4) when as_two_stage, - otherwise has shape ((bs, num_query, 2). - valid_ratios (Tensor): The radios of valid - points on the feature map, has shape - (bs, num_levels, 2) - reg_branch: (obj:`nn.ModuleList`): Used for - refining the regression results. Only would - be passed when with_box_refine is True, - otherwise would be passed a `None`. - - Returns: - Tensor: Results with shape [1, num_query, bs, embed_dims] when - return_intermediate is `False`, otherwise it has shape - [num_layers, num_query, bs, embed_dims]. 
- """ - output = query - intermediate = [] - intermediate_reference_points = [] - for lid, layer in enumerate(self.layers): - if reference_points.shape[-1] == 4: - reference_points_input = reference_points[:, :, None] * \ - torch.cat([valid_ratios, valid_ratios], -1)[:, None] - else: - assert reference_points.shape[-1] == 2 - reference_points_input = reference_points[:, :, None] * \ - valid_ratios[:, None] - output = layer( - output, - *args, - reference_points=reference_points_input, - **kwargs) - output = output.permute(1, 0, 2) - - if reg_branches is not None: - tmp = reg_branches[lid](output) - if reference_points.shape[-1] == 4: - new_reference_points = tmp + inverse_sigmoid( - reference_points) - new_reference_points = new_reference_points.sigmoid() - else: - assert reference_points.shape[-1] == 2 - new_reference_points = tmp - new_reference_points[..., :2] = tmp[ - ..., :2] + inverse_sigmoid(reference_points) - new_reference_points = new_reference_points.sigmoid() - reference_points = new_reference_points.detach() - - output = output.permute(1, 0, 2) - if self.return_intermediate: - intermediate.append(output) - intermediate_reference_points.append(reference_points) - - if self.return_intermediate: - return torch.stack(intermediate), torch.stack( - intermediate_reference_points) - - return output, reference_points - - -@MODELS.register_module(force=True) -class DeformableDetrTransformer(Transformer): - """Implements the DeformableDETR transformer. - - Args: - as_two_stage (bool): Generate query from encoder features. - Default: False. - num_feature_levels (int): Number of feature maps from FPN: - Default: 4. - two_stage_num_proposals (int): Number of proposals when set - `as_two_stage` as True. Default: 300. - """ - - def __init__(self, - as_two_stage=False, - num_feature_levels=4, - two_stage_num_proposals=300, - **kwargs): - super(DeformableDetrTransformer, self).__init__(**kwargs) - self.as_two_stage = as_two_stage - self.num_feature_levels = num_feature_levels - self.two_stage_num_proposals = two_stage_num_proposals - self.embed_dims = self.encoder.embed_dims - self.init_layers() - - def init_layers(self): - """Initialize layers of the DeformableDetrTransformer.""" - self.level_embeds = nn.Parameter( - torch.Tensor(self.num_feature_levels, self.embed_dims)) - - if self.as_two_stage: - self.enc_output = nn.Linear(self.embed_dims, self.embed_dims) - self.enc_output_norm = nn.LayerNorm(self.embed_dims) - self.pos_trans = nn.Linear(self.embed_dims * 2, - self.embed_dims * 2) - self.pos_trans_norm = nn.LayerNorm(self.embed_dims * 2) - else: - self.reference_points = nn.Linear(self.embed_dims, 2) - - def init_weights(self): - """Initialize the transformer weights.""" - for p in self.parameters(): - if p.dim() > 1: - nn.init.xavier_uniform_(p) - for m in self.modules(): - if isinstance(m, MultiScaleDeformableAttention): - m.init_weights() - if not self.as_two_stage: - xavier_init(self.reference_points, distribution='uniform', bias=0.) - normal_(self.level_embeds) - - def gen_encoder_output_proposals(self, memory, memory_padding_mask, - spatial_shapes): - """Generate proposals from encoded memory. - - Args: - memory (Tensor) : The output of encoder, - has shape (bs, num_key, embed_dim). num_key is - equal the number of points on feature map from - all level. - memory_padding_mask (Tensor): Padding mask for memory. - has shape (bs, num_key). - spatial_shapes (Tensor): The shape of all feature maps. - has shape (num_level, 2). 
- - Returns: - tuple: A tuple of feature map and bbox prediction. - - - output_memory (Tensor): The input of decoder, \ - has shape (bs, num_key, embed_dim). num_key is \ - equal the number of points on feature map from \ - all levels. - - output_proposals (Tensor): The normalized proposal \ - after a inverse sigmoid, has shape \ - (bs, num_keys, 4). - """ - - N, S, C = memory.shape - proposals = [] - _cur = 0 - for lvl, (H, W) in enumerate(spatial_shapes): - mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H * W)].view( - N, H, W, 1) - valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1) - valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1) - - grid_y, grid_x = torch.meshgrid( - torch.linspace( - 0, H - 1, H, dtype=torch.float32, device=memory.device), - torch.linspace( - 0, W - 1, W, dtype=torch.float32, device=memory.device)) - grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) - - scale = torch.cat([valid_W.unsqueeze(-1), - valid_H.unsqueeze(-1)], 1).view(N, 1, 1, 2) - grid = (grid.unsqueeze(0).expand(N, -1, -1, -1) + 0.5) / scale - wh = torch.ones_like(grid) * 0.05 * (2.0**lvl) - proposal = torch.cat((grid, wh), -1).view(N, -1, 4) - proposals.append(proposal) - _cur += (H * W) - output_proposals = torch.cat(proposals, 1) - output_proposals_valid = ((output_proposals > 0.01) & - (output_proposals < 0.99)).all( - -1, keepdim=True) - output_proposals = torch.log(output_proposals / (1 - output_proposals)) - output_proposals = output_proposals.masked_fill( - memory_padding_mask.unsqueeze(-1), float('inf')) - output_proposals = output_proposals.masked_fill( - ~output_proposals_valid, float('inf')) - - output_memory = memory - output_memory = output_memory.masked_fill( - memory_padding_mask.unsqueeze(-1), float(0)) - output_memory = output_memory.masked_fill(~output_proposals_valid, - float(0)) - output_memory = self.enc_output_norm(self.enc_output(output_memory)) - return output_memory, output_proposals - - @staticmethod - def get_reference_points(spatial_shapes, valid_ratios, device): - """Get the reference points used in decoder. - - Args: - spatial_shapes (Tensor): The shape of all - feature maps, has shape (num_level, 2). - valid_ratios (Tensor): The radios of valid - points on the feature map, has shape - (bs, num_levels, 2) - device (obj:`device`): The device where - reference_points should be. - - Returns: - Tensor: reference points used in decoder, has \ - shape (bs, num_keys, num_levels, 2). 
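The normalized center grid built per level can be reproduced in isolation; a rough sketch for a single level with no padding (so the valid ratio is 1 and the extra division is a no-op):

```python
import torch

H, W = 4, 6
ref_y, ref_x = torch.meshgrid(
    torch.linspace(0.5, H - 0.5, H),
    torch.linspace(0.5, W - 0.5, W))

# Normalize pixel centers to (0, 1); with padding these would additionally
# be divided by the valid ratio of the level.
ref = torch.stack((ref_x.reshape(-1) / W, ref_y.reshape(-1) / H), -1)
print(ref.shape)        # (H * W, 2)
print(ref[0], ref[-1])  # first and last cell centers
```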
- """ - reference_points_list = [] - for lvl, (H, W) in enumerate(spatial_shapes): - ref_y, ref_x = torch.meshgrid( - torch.linspace( - 0.5, H - 0.5, H, dtype=torch.float32, device=device), - torch.linspace( - 0.5, W - 0.5, W, dtype=torch.float32, device=device)) - ref_y = ref_y.reshape(-1)[None] / ( - valid_ratios[:, None, lvl, 1] * H) - ref_x = ref_x.reshape(-1)[None] / ( - valid_ratios[:, None, lvl, 0] * W) - ref = torch.stack((ref_x, ref_y), -1) - reference_points_list.append(ref) - reference_points = torch.cat(reference_points_list, 1) - reference_points = reference_points[:, :, None] * valid_ratios[:, None] - return reference_points - - def get_valid_ratio(self, mask): - """Get the valid radios of feature maps of all level.""" - _, H, W = mask.shape - valid_H = torch.sum(~mask[:, :, 0], 1) - valid_W = torch.sum(~mask[:, 0, :], 1) - valid_ratio_h = valid_H.float() / H - valid_ratio_w = valid_W.float() / W - valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1) - return valid_ratio - - def get_proposal_pos_embed(self, - proposals, - num_pos_feats=128, - temperature=10000): - """Get the position embedding of proposal.""" - scale = 2 * math.pi - dim_t = torch.arange( - num_pos_feats, dtype=torch.float32, device=proposals.device) - dim_t = temperature**(2 * (dim_t // 2) / num_pos_feats) - # N, L, 4 - proposals = proposals.sigmoid() * scale - # N, L, 4, 128 - pos = proposals[:, :, :, None] / dim_t - # N, L, 4, 64, 2 - pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), - dim=4).flatten(2) - return pos - - def forward(self, - mlvl_feats, - mlvl_masks, - query_embed, - mlvl_pos_embeds, - reg_branches=None, - cls_branches=None, - **kwargs): - """Forward function for `Transformer`. - - Args: - mlvl_feats (list(Tensor)): Input queries from - different level. Each element has shape - [bs, embed_dims, h, w]. - mlvl_masks (list(Tensor)): The key_padding_mask from - different level used for encoder and decoder, - each element has shape [bs, h, w]. - query_embed (Tensor): The query embedding for decoder, - with shape [num_query, c]. - mlvl_pos_embeds (list(Tensor)): The positional encoding - of feats from different level, has the shape - [bs, embed_dims, h, w]. - reg_branches (obj:`nn.ModuleList`): Regression heads for - feature maps from each decoder layer. Only would - be passed when - `with_box_refine` is True. Default to None. - cls_branches (obj:`nn.ModuleList`): Classification heads - for feature maps from each decoder layer. Only would - be passed when `as_two_stage` - is True. Default to None. - - - Returns: - tuple[Tensor]: results of decoder containing the following tensor. - - - inter_states: Outputs from decoder. If - return_intermediate_dec is True output has shape \ - (num_dec_layers, bs, num_query, embed_dims), else has \ - shape (1, bs, num_query, embed_dims). - - init_reference_out: The initial value of reference \ - points, has shape (bs, num_queries, 4). - - inter_references_out: The internal value of reference \ - points in decoder, has shape \ - (num_dec_layers, bs,num_query, embed_dims) - - enc_outputs_class: The classification score of \ - proposals generated from \ - encoder's feature maps, has shape \ - (batch, h*w, num_classes). \ - Only would be returned when `as_two_stage` is True, \ - otherwise None. - - enc_outputs_coord_unact: The regression results \ - generated from encoder's feature maps., has shape \ - (batch, h*w, 4). Only would \ - be returned when `as_two_stage` is True, \ - otherwise None. 
- """ - assert self.as_two_stage or query_embed is not None - - feat_flatten = [] - mask_flatten = [] - lvl_pos_embed_flatten = [] - spatial_shapes = [] - for lvl, (feat, mask, pos_embed) in enumerate( - zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)): - bs, c, h, w = feat.shape - spatial_shape = (h, w) - spatial_shapes.append(spatial_shape) - feat = feat.flatten(2).transpose(1, 2) - mask = mask.flatten(1) - pos_embed = pos_embed.flatten(2).transpose(1, 2) - lvl_pos_embed = pos_embed + self.level_embeds[lvl].view(1, 1, -1) - lvl_pos_embed_flatten.append(lvl_pos_embed) - feat_flatten.append(feat) - mask_flatten.append(mask) - feat_flatten = torch.cat(feat_flatten, 1) - mask_flatten = torch.cat(mask_flatten, 1) - lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) - spatial_shapes = torch.as_tensor( - spatial_shapes, dtype=torch.long, device=feat_flatten.device) - level_start_index = torch.cat((spatial_shapes.new_zeros( - (1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) - valid_ratios = torch.stack( - [self.get_valid_ratio(m) for m in mlvl_masks], 1) - - reference_points = \ - self.get_reference_points(spatial_shapes, - valid_ratios, - device=feat.device) - - feat_flatten = feat_flatten.permute(1, 0, 2) # (H*W, bs, embed_dims) - lvl_pos_embed_flatten = lvl_pos_embed_flatten.permute( - 1, 0, 2) # (H*W, bs, embed_dims) - memory = self.encoder( - query=feat_flatten, - key=None, - value=None, - query_pos=lvl_pos_embed_flatten, - query_key_padding_mask=mask_flatten, - spatial_shapes=spatial_shapes, - reference_points=reference_points, - level_start_index=level_start_index, - valid_ratios=valid_ratios, - **kwargs) - - memory = memory.permute(1, 0, 2) - bs, _, c = memory.shape - if self.as_two_stage: - output_memory, output_proposals = \ - self.gen_encoder_output_proposals( - memory, mask_flatten, spatial_shapes) - enc_outputs_class = cls_branches[self.decoder.num_layers]( - output_memory) - enc_outputs_coord_unact = \ - reg_branches[ - self.decoder.num_layers](output_memory) + output_proposals - - topk = self.two_stage_num_proposals - # We only use the first channel in enc_outputs_class as foreground, - # the other (num_classes - 1) channels are actually not used. - # Its targets are set to be 0s, which indicates the first - # class (foreground) because we use [0, num_classes - 1] to - # indicate class labels, background class is indicated by - # num_classes (similar convention in RPN). - # See https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/dense_heads/deformable_detr_head.py#L241 # noqa - # This follows the official implementation of Deformable DETR. 
- topk_proposals = torch.topk( - enc_outputs_class[..., 0], topk, dim=1)[1] - topk_coords_unact = torch.gather( - enc_outputs_coord_unact, 1, - topk_proposals.unsqueeze(-1).repeat(1, 1, 4)) - topk_coords_unact = topk_coords_unact.detach() - reference_points = topk_coords_unact.sigmoid() - init_reference_out = reference_points - pos_trans_out = self.pos_trans_norm( - self.pos_trans(self.get_proposal_pos_embed(topk_coords_unact))) - query_pos, query = torch.split(pos_trans_out, c, dim=2) - else: - query_pos, query = torch.split(query_embed, c, dim=1) - query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1) - query = query.unsqueeze(0).expand(bs, -1, -1) - reference_points = self.reference_points(query_pos).sigmoid() - init_reference_out = reference_points - - # decoder - query = query.permute(1, 0, 2) - memory = memory.permute(1, 0, 2) - query_pos = query_pos.permute(1, 0, 2) - inter_states, inter_references = self.decoder( - query=query, - key=None, - value=memory, - query_pos=query_pos, - key_padding_mask=mask_flatten, - reference_points=reference_points, - spatial_shapes=spatial_shapes, - level_start_index=level_start_index, - valid_ratios=valid_ratios, - reg_branches=reg_branches, - **kwargs) - - inter_references_out = inter_references - if self.as_two_stage: - return inter_states, init_reference_out,\ - inter_references_out, enc_outputs_class,\ - enc_outputs_coord_unact - return inter_states, init_reference_out, \ - inter_references_out, None, None - - -@MODELS.register_module() -class CoDeformableDetrTransformerDecoder(TransformerLayerSequence): - """Implements the decoder in DETR transformer. - - Args: - return_intermediate (bool): Whether to return intermediate outputs. - coder_norm_cfg (dict): Config of last normalization layer. Default: - `LN`. - """ - - def __init__(self, - *args, - return_intermediate=False, - look_forward_twice=False, - **kwargs): - - super(CoDeformableDetrTransformerDecoder, - self).__init__(*args, **kwargs) - self.return_intermediate = return_intermediate - self.look_forward_twice = look_forward_twice - - def forward(self, - query, - *args, - reference_points=None, - valid_ratios=None, - reg_branches=None, - **kwargs): - """Forward function for `TransformerDecoder`. - - Args: - query (Tensor): Input query with shape - `(num_query, bs, embed_dims)`. - reference_points (Tensor): The reference - points of offset. has shape - (bs, num_query, 4) when as_two_stage, - otherwise has shape ((bs, num_query, 2). - valid_ratios (Tensor): The radios of valid - points on the feature map, has shape - (bs, num_levels, 2) - reg_branch: (obj:`nn.ModuleList`): Used for - refining the regression results. Only would - be passed when with_box_refine is True, - otherwise would be passed a `None`. - - Returns: - Tensor: Results with shape [1, num_query, bs, embed_dims] when - return_intermediate is `False`, otherwise it has shape - [num_layers, num_query, bs, embed_dims]. 
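The only behavioural difference from the plain deformable decoder is which reference points are recorded per layer when `look_forward_twice` is enabled; a schematic of that branch from the body below, with dummy tensors standing in for the decoder layers and the regression branch:

```python
import torch

look_forward_twice = True
intermediate_reference_points = []

reference_points = torch.rand(2, 300, 4)
for _ in range(6):  # dummy decoder layers
    # Pretend the regression branch produced refined points for this layer.
    new_reference_points = torch.rand(2, 300, 4)
    # "Look forward twice": store the refined points for this layer's loss;
    # otherwise store the (detached) input reference points as usual.
    intermediate_reference_points.append(
        new_reference_points if look_forward_twice else reference_points)
    reference_points = new_reference_points.detach()

print(torch.stack(intermediate_reference_points).shape)  # (6, 2, 300, 4)
```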
- """ - output = query - intermediate = [] - intermediate_reference_points = [] - for lid, layer in enumerate(self.layers): - if reference_points.shape[-1] == 4: - reference_points_input = reference_points[:, :, None] * \ - torch.cat([valid_ratios, valid_ratios], -1)[:, None] - else: - assert reference_points.shape[-1] == 2 - reference_points_input = reference_points[:, :, None] * \ - valid_ratios[:, None] - output = layer( - output, - *args, - reference_points=reference_points_input, - **kwargs) - output = output.permute(1, 0, 2) - - if reg_branches is not None: - tmp = reg_branches[lid](output) - if reference_points.shape[-1] == 4: - new_reference_points = tmp + inverse_sigmoid( - reference_points) - new_reference_points = new_reference_points.sigmoid() - else: - assert reference_points.shape[-1] == 2 - new_reference_points = tmp - new_reference_points[..., :2] = tmp[ - ..., :2] + inverse_sigmoid(reference_points) - new_reference_points = new_reference_points.sigmoid() - reference_points = new_reference_points.detach() - - output = output.permute(1, 0, 2) - if self.return_intermediate: - intermediate.append(output) - intermediate_reference_points.append( - new_reference_points if self. - look_forward_twice else reference_points) - if self.return_intermediate: - return torch.stack(intermediate), torch.stack( - intermediate_reference_points) - - return output, reference_points - - -@MODELS.register_module() -class CoDeformableDetrTransformer(DeformableDetrTransformer): - - def __init__(self, - mixed_selection=True, - with_pos_coord=True, - with_coord_feat=True, - num_co_heads=1, - **kwargs): - self.mixed_selection = mixed_selection - self.with_pos_coord = with_pos_coord - self.with_coord_feat = with_coord_feat - self.num_co_heads = num_co_heads - super(CoDeformableDetrTransformer, self).__init__(**kwargs) - self._init_layers() - - def _init_layers(self): - """Initialize layers of the CoDeformableDetrTransformer.""" - if self.with_pos_coord: - if self.num_co_heads > 0: - # bug: this code should be 'self.head_pos_embed = - # nn.Embedding(self.num_co_heads, self.embed_dims)', - # we keep this bug for reproducing our results with ResNet-50. - # You can fix this bug when reproducing results with - # swin transformer. 
- self.head_pos_embed = nn.Embedding(self.num_co_heads, 1, 1, - self.embed_dims) - self.aux_pos_trans = nn.ModuleList() - self.aux_pos_trans_norm = nn.ModuleList() - self.pos_feats_trans = nn.ModuleList() - self.pos_feats_norm = nn.ModuleList() - for i in range(self.num_co_heads): - self.aux_pos_trans.append( - nn.Linear(self.embed_dims * 2, self.embed_dims * 2)) - self.aux_pos_trans_norm.append( - nn.LayerNorm(self.embed_dims * 2)) - if self.with_coord_feat: - self.pos_feats_trans.append( - nn.Linear(self.embed_dims, self.embed_dims)) - self.pos_feats_norm.append( - nn.LayerNorm(self.embed_dims)) - - def get_proposal_pos_embed(self, - proposals, - num_pos_feats=128, - temperature=10000): - """Get the position embedding of proposal.""" - num_pos_feats = self.embed_dims // 2 - scale = 2 * math.pi - dim_t = torch.arange( - num_pos_feats, dtype=torch.float32, device=proposals.device) - dim_t = temperature**(2 * (dim_t // 2) / num_pos_feats) - # N, L, 4 - proposals = proposals.sigmoid() * scale - # N, L, 4, 128 - pos = proposals[:, :, :, None] / dim_t - # N, L, 4, 64, 2 - pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), - dim=4).flatten(2) - return pos - - def forward(self, - mlvl_feats, - mlvl_masks, - query_embed, - mlvl_pos_embeds, - reg_branches=None, - cls_branches=None, - return_encoder_output=False, - attn_masks=None, - **kwargs): - """Forward function for `Transformer`. - - Args: - mlvl_feats (list(Tensor)): Input queries from - different level. Each element has shape - [bs, embed_dims, h, w]. - mlvl_masks (list(Tensor)): The key_padding_mask from - different level used for encoder and decoder, - each element has shape [bs, h, w]. - query_embed (Tensor): The query embedding for decoder, - with shape [num_query, c]. - mlvl_pos_embeds (list(Tensor)): The positional encoding - of feats from different level, has the shape - [bs, embed_dims, h, w]. - reg_branches (obj:`nn.ModuleList`): Regression heads for - feature maps from each decoder layer. Only would - be passed when - `with_box_refine` is True. Default to None. - cls_branches (obj:`nn.ModuleList`): Classification heads - for feature maps from each decoder layer. Only would - be passed when `as_two_stage` - is True. Default to None. - - - Returns: - tuple[Tensor]: results of decoder containing the following tensor. - - - inter_states: Outputs from decoder. If - return_intermediate_dec is True output has shape \ - (num_dec_layers, bs, num_query, embed_dims), else has \ - shape (1, bs, num_query, embed_dims). - - init_reference_out: The initial value of reference \ - points, has shape (bs, num_queries, 4). - - inter_references_out: The internal value of reference \ - points in decoder, has shape \ - (num_dec_layers, bs,num_query, embed_dims) - - enc_outputs_class: The classification score of \ - proposals generated from \ - encoder's feature maps, has shape \ - (batch, h*w, num_classes). \ - Only would be returned when `as_two_stage` is True, \ - otherwise None. - - enc_outputs_coord_unact: The regression results \ - generated from encoder's feature maps., has shape \ - (batch, h*w, 4). Only would \ - be returned when `as_two_stage` is True, \ - otherwise None. 
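Under `mixed_selection`, only the positional part of the query comes from the top-k encoder proposals, while the content part is a learned embedding shared across images; a minimal sketch of that split (shapes invented) as it appears in the body below:

```python
import torch

bs, topk, c = 2, 300, 256
pos_trans_out = torch.randn(bs, topk, 2 * c)  # stand-in for pos_trans_norm(pos_trans(...))
query_embed = torch.randn(topk, c)            # learned content embedding

mixed_selection = True
if not mixed_selection:
    # Plain two-stage deformable DETR: both halves come from the proposals.
    query_pos, query = torch.split(pos_trans_out, c, dim=2)
else:
    # Mixed selection: content query from the shared embedding,
    # positional query from the proposals.
    query = query_embed.unsqueeze(0).expand(bs, -1, -1)
    query_pos, _ = torch.split(pos_trans_out, c, dim=2)

print(query.shape, query_pos.shape)  # (bs, topk, c) twice
```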
- """ - assert self.as_two_stage or query_embed is not None - - feat_flatten = [] - mask_flatten = [] - lvl_pos_embed_flatten = [] - spatial_shapes = [] - for lvl, (feat, mask, pos_embed) in enumerate( - zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)): - bs, c, h, w = feat.shape - spatial_shape = (h, w) - spatial_shapes.append(spatial_shape) - feat = feat.flatten(2).transpose(1, 2) - mask = mask.flatten(1) - pos_embed = pos_embed.flatten(2).transpose(1, 2) - lvl_pos_embed = pos_embed + self.level_embeds[lvl].view(1, 1, -1) - lvl_pos_embed_flatten.append(lvl_pos_embed) - feat_flatten.append(feat) - mask_flatten.append(mask) - feat_flatten = torch.cat(feat_flatten, 1) - mask_flatten = torch.cat(mask_flatten, 1) - lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) - spatial_shapes = torch.as_tensor( - spatial_shapes, dtype=torch.long, device=feat_flatten.device) - level_start_index = torch.cat((spatial_shapes.new_zeros( - (1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) - valid_ratios = torch.stack( - [self.get_valid_ratio(m) for m in mlvl_masks], 1) - - reference_points = \ - self.get_reference_points(spatial_shapes, - valid_ratios, - device=feat.device) - - feat_flatten = feat_flatten.permute(1, 0, 2) # (H*W, bs, embed_dims) - lvl_pos_embed_flatten = lvl_pos_embed_flatten.permute( - 1, 0, 2) # (H*W, bs, embed_dims) - memory = self.encoder( - query=feat_flatten, - key=None, - value=None, - query_pos=lvl_pos_embed_flatten, - query_key_padding_mask=mask_flatten, - spatial_shapes=spatial_shapes, - reference_points=reference_points, - level_start_index=level_start_index, - valid_ratios=valid_ratios, - **kwargs) - - memory = memory.permute(1, 0, 2) - bs, _, c = memory.shape - if self.as_two_stage: - output_memory, output_proposals = \ - self.gen_encoder_output_proposals( - memory, mask_flatten, spatial_shapes) - enc_outputs_class = cls_branches[self.decoder.num_layers]( - output_memory) - enc_outputs_coord_unact = \ - reg_branches[ - self.decoder.num_layers](output_memory) + output_proposals - - topk = self.two_stage_num_proposals - topk = query_embed.shape[0] - topk_proposals = torch.topk( - enc_outputs_class[..., 0], topk, dim=1)[1] - topk_coords_unact = torch.gather( - enc_outputs_coord_unact, 1, - topk_proposals.unsqueeze(-1).repeat(1, 1, 4)) - topk_coords_unact = topk_coords_unact.detach() - reference_points = topk_coords_unact.sigmoid() - init_reference_out = reference_points - pos_trans_out = self.pos_trans_norm( - self.pos_trans(self.get_proposal_pos_embed(topk_coords_unact))) - - if not self.mixed_selection: - query_pos, query = torch.split(pos_trans_out, c, dim=2) - else: - # query_embed here is the content embed for deformable DETR - query = query_embed.unsqueeze(0).expand(bs, -1, -1) - query_pos, _ = torch.split(pos_trans_out, c, dim=2) - else: - query_pos, query = torch.split(query_embed, c, dim=1) - query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1) - query = query.unsqueeze(0).expand(bs, -1, -1) - reference_points = self.reference_points(query_pos).sigmoid() - init_reference_out = reference_points - - # decoder - query = query.permute(1, 0, 2) - memory = memory.permute(1, 0, 2) - query_pos = query_pos.permute(1, 0, 2) - inter_states, inter_references = self.decoder( - query=query, - key=None, - value=memory, - query_pos=query_pos, - key_padding_mask=mask_flatten, - reference_points=reference_points, - spatial_shapes=spatial_shapes, - level_start_index=level_start_index, - valid_ratios=valid_ratios, - reg_branches=reg_branches, - attn_masks=attn_masks, - **kwargs) - - 
inter_references_out = inter_references - if self.as_two_stage: - if return_encoder_output: - return inter_states, init_reference_out,\ - inter_references_out, enc_outputs_class,\ - enc_outputs_coord_unact, memory - return inter_states, init_reference_out,\ - inter_references_out, enc_outputs_class,\ - enc_outputs_coord_unact - if return_encoder_output: - return inter_states, init_reference_out, \ - inter_references_out, None, None, memory - return inter_states, init_reference_out, \ - inter_references_out, None, None - - def forward_aux(self, - mlvl_feats, - mlvl_masks, - query_embed, - mlvl_pos_embeds, - pos_anchors, - pos_feats=None, - reg_branches=None, - cls_branches=None, - return_encoder_output=False, - attn_masks=None, - head_idx=0, - **kwargs): - feat_flatten = [] - mask_flatten = [] - spatial_shapes = [] - for lvl, (feat, mask, pos_embed) in enumerate( - zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)): - bs, c, h, w = feat.shape - spatial_shape = (h, w) - spatial_shapes.append(spatial_shape) - feat = feat.flatten(2).transpose(1, 2) - mask = mask.flatten(1) - feat_flatten.append(feat) - mask_flatten.append(mask) - feat_flatten = torch.cat(feat_flatten, 1) - mask_flatten = torch.cat(mask_flatten, 1) - spatial_shapes = torch.as_tensor( - spatial_shapes, dtype=torch.long, device=feat_flatten.device) - level_start_index = torch.cat((spatial_shapes.new_zeros( - (1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) - valid_ratios = torch.stack( - [self.get_valid_ratio(m) for m in mlvl_masks], 1) - - feat_flatten = feat_flatten.permute(1, 0, 2) # (H*W, bs, embed_dims) - - memory = feat_flatten - memory = memory.permute(1, 0, 2) - bs, _, c = memory.shape - - topk_coords_unact = inverse_sigmoid(pos_anchors) - reference_points = pos_anchors - init_reference_out = reference_points - if self.num_co_heads > 0: - pos_trans_out = self.aux_pos_trans_norm[head_idx]( - self.aux_pos_trans[head_idx]( - self.get_proposal_pos_embed(topk_coords_unact))) - query_pos, query = torch.split(pos_trans_out, c, dim=2) - if self.with_coord_feat: - query = query + self.pos_feats_norm[head_idx]( - self.pos_feats_trans[head_idx](pos_feats)) - query_pos = query_pos + self.head_pos_embed.weight[head_idx] - - # decoder - query = query.permute(1, 0, 2) - memory = memory.permute(1, 0, 2) - query_pos = query_pos.permute(1, 0, 2) - inter_states, inter_references = self.decoder( - query=query, - key=None, - value=memory, - query_pos=query_pos, - key_padding_mask=mask_flatten, - reference_points=reference_points, - spatial_shapes=spatial_shapes, - level_start_index=level_start_index, - valid_ratios=valid_ratios, - reg_branches=reg_branches, - attn_masks=attn_masks, - **kwargs) - - inter_references_out = inter_references - return inter_states, init_reference_out, \ - inter_references_out - - -def build_MLP(input_dim, hidden_dim, output_dim, num_layers): - assert num_layers > 1, \ - f'num_layers should be greater than 1 but got {num_layers}' - h = [hidden_dim] * (num_layers - 1) - layers = list() - for n, k in zip([input_dim] + h[:-1], h): - layers.extend((nn.Linear(n, k), nn.ReLU())) - # Note that the relu func of MLP in original DETR repo is set - # 'inplace=False', however the ReLU cfg of FFN in mmdet is set - # 'inplace=True' by default. 
- layers.append(nn.Linear(hidden_dim, output_dim)) - return nn.Sequential(*layers) - - -@MODELS.register_module() -class DinoTransformerDecoder(DeformableDetrTransformerDecoder): - - def __init__(self, *args, **kwargs): - super(DinoTransformerDecoder, self).__init__(*args, **kwargs) - self._init_layers() - - def _init_layers(self): - self.ref_point_head = build_MLP(self.embed_dims * 2, self.embed_dims, - self.embed_dims, 2) - self.norm = nn.LayerNorm(self.embed_dims) - - @staticmethod - def gen_sineembed_for_position(pos_tensor, pos_feat): - # n_query, bs, _ = pos_tensor.size() - # sineembed_tensor = torch.zeros(n_query, bs, 256) - scale = 2 * math.pi - dim_t = torch.arange( - pos_feat, dtype=torch.float32, device=pos_tensor.device) - dim_t = 10000**(2 * (dim_t // 2) / pos_feat) - x_embed = pos_tensor[:, :, 0] * scale - y_embed = pos_tensor[:, :, 1] * scale - pos_x = x_embed[:, :, None] / dim_t - pos_y = y_embed[:, :, None] / dim_t - pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), - dim=3).flatten(2) - pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), - dim=3).flatten(2) - if pos_tensor.size(-1) == 2: - pos = torch.cat((pos_y, pos_x), dim=2) - elif pos_tensor.size(-1) == 4: - w_embed = pos_tensor[:, :, 2] * scale - pos_w = w_embed[:, :, None] / dim_t - pos_w = torch.stack( - (pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), - dim=3).flatten(2) - - h_embed = pos_tensor[:, :, 3] * scale - pos_h = h_embed[:, :, None] / dim_t - pos_h = torch.stack( - (pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), - dim=3).flatten(2) - - pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2) - else: - raise ValueError('Unknown pos_tensor shape(-1):{}'.format( - pos_tensor.size(-1))) - return pos - - def forward(self, - query, - *args, - reference_points=None, - valid_ratios=None, - reg_branches=None, - **kwargs): - output = query - intermediate = [] - intermediate_reference_points = [reference_points] - for lid, layer in enumerate(self.layers): - if reference_points.shape[-1] == 4: - reference_points_input = \ - reference_points[:, :, None] * torch.cat( - [valid_ratios, valid_ratios], -1)[:, None] - else: - assert reference_points.shape[-1] == 2 - reference_points_input = \ - reference_points[:, :, None] * valid_ratios[:, None] - - query_sine_embed = self.gen_sineembed_for_position( - reference_points_input[:, :, 0, :], self.embed_dims // 2) - query_pos = self.ref_point_head(query_sine_embed) - - query_pos = query_pos.permute(1, 0, 2) - output = layer( - output, - *args, - query_pos=query_pos, - reference_points=reference_points_input, - **kwargs) - output = output.permute(1, 0, 2) - - if reg_branches is not None: - tmp = reg_branches[lid](output) - assert reference_points.shape[-1] == 4 - new_reference_points = tmp + inverse_sigmoid( - reference_points, eps=1e-3) - new_reference_points = new_reference_points.sigmoid() - reference_points = new_reference_points.detach() - - output = output.permute(1, 0, 2) - if self.return_intermediate: - intermediate.append(self.norm(output)) - intermediate_reference_points.append(new_reference_points) - # NOTE this is for the "Look Forward Twice" module, - # in the DeformDETR, reference_points was appended. 
- - if self.return_intermediate: - return torch.stack(intermediate), torch.stack( - intermediate_reference_points) - - return output, reference_points - - -@MODELS.register_module() -class CoDinoTransformer(CoDeformableDetrTransformer): - - def __init__(self, *args, **kwargs): - super(CoDinoTransformer, self).__init__(*args, **kwargs) - - def init_layers(self): - """Initialize layers of the DinoTransformer.""" - self.level_embeds = nn.Parameter( - torch.Tensor(self.num_feature_levels, self.embed_dims)) - self.enc_output = nn.Linear(self.embed_dims, self.embed_dims) - self.enc_output_norm = nn.LayerNorm(self.embed_dims) - self.query_embed = nn.Embedding(self.two_stage_num_proposals, - self.embed_dims) - - def _init_layers(self): - if self.with_pos_coord: - if self.num_co_heads > 0: - self.aux_pos_trans = nn.ModuleList() - self.aux_pos_trans_norm = nn.ModuleList() - self.pos_feats_trans = nn.ModuleList() - self.pos_feats_norm = nn.ModuleList() - for i in range(self.num_co_heads): - self.aux_pos_trans.append( - nn.Linear(self.embed_dims * 2, self.embed_dims)) - self.aux_pos_trans_norm.append( - nn.LayerNorm(self.embed_dims)) - if self.with_coord_feat: - self.pos_feats_trans.append( - nn.Linear(self.embed_dims, self.embed_dims)) - self.pos_feats_norm.append( - nn.LayerNorm(self.embed_dims)) - - def init_weights(self): - super().init_weights() - nn.init.normal_(self.query_embed.weight.data) - - def forward(self, - mlvl_feats, - mlvl_masks, - query_embed, - mlvl_pos_embeds, - dn_label_query, - dn_bbox_query, - attn_mask, - reg_branches=None, - cls_branches=None, - **kwargs): - assert self.as_two_stage and query_embed is None, \ - 'as_two_stage must be True for DINO' - - feat_flatten = [] - mask_flatten = [] - lvl_pos_embed_flatten = [] - spatial_shapes = [] - for lvl, (feat, mask, pos_embed) in enumerate( - zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)): - bs, c, h, w = feat.shape - spatial_shape = (h, w) - spatial_shapes.append(spatial_shape) - feat = feat.flatten(2).transpose(1, 2) - mask = mask.flatten(1) - pos_embed = pos_embed.flatten(2).transpose(1, 2) - lvl_pos_embed = pos_embed + self.level_embeds[lvl].view(1, 1, -1) - lvl_pos_embed_flatten.append(lvl_pos_embed) - feat_flatten.append(feat) - mask_flatten.append(mask) - feat_flatten = torch.cat(feat_flatten, 1) - mask_flatten = torch.cat(mask_flatten, 1) - lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) - spatial_shapes = torch.as_tensor( - spatial_shapes, dtype=torch.long, device=feat_flatten.device) - level_start_index = torch.cat((spatial_shapes.new_zeros( - (1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) - valid_ratios = torch.stack( - [self.get_valid_ratio(m) for m in mlvl_masks], 1) - - reference_points = self.get_reference_points( - spatial_shapes, valid_ratios, device=feat.device) - - feat_flatten = feat_flatten.permute(1, 0, 2) # (H*W, bs, embed_dims) - lvl_pos_embed_flatten = lvl_pos_embed_flatten.permute( - 1, 0, 2) # (H*W, bs, embed_dims) - memory = self.encoder( - query=feat_flatten, - key=None, - value=None, - query_pos=lvl_pos_embed_flatten, - query_key_padding_mask=mask_flatten, - spatial_shapes=spatial_shapes, - reference_points=reference_points, - level_start_index=level_start_index, - valid_ratios=valid_ratios, - **kwargs) - memory = memory.permute(1, 0, 2) - bs, _, c = memory.shape - - output_memory, output_proposals = self.gen_encoder_output_proposals( - memory, mask_flatten, spatial_shapes) - enc_outputs_class = cls_branches[self.decoder.num_layers]( - output_memory) - enc_outputs_coord_unact = 
reg_branches[self.decoder.num_layers]( - output_memory) + output_proposals - cls_out_features = cls_branches[self.decoder.num_layers].out_features - topk = self.two_stage_num_proposals - # NOTE In DeformDETR, enc_outputs_class[..., 0] is used for topk - topk_indices = torch.topk(enc_outputs_class.max(-1)[0], topk, dim=1)[1] - - topk_score = torch.gather( - enc_outputs_class, 1, - topk_indices.unsqueeze(-1).repeat(1, 1, cls_out_features)) - topk_coords_unact = torch.gather( - enc_outputs_coord_unact, 1, - topk_indices.unsqueeze(-1).repeat(1, 1, 4)) - topk_anchor = topk_coords_unact.sigmoid() - topk_coords_unact = topk_coords_unact.detach() - - query = self.query_embed.weight[:, None, :].repeat(1, bs, - 1).transpose(0, 1) - # NOTE the query_embed here is not spatial query as in DETR. - # It is actually content query, which is named tgt in other - # DETR-like models - if dn_label_query is not None: - query = torch.cat([dn_label_query, query], dim=1) - if dn_bbox_query is not None: - reference_points = torch.cat([dn_bbox_query, topk_coords_unact], - dim=1) - else: - reference_points = topk_coords_unact - reference_points = reference_points.sigmoid() - # decoder - query = query.permute(1, 0, 2) - memory = memory.permute(1, 0, 2) - inter_states, inter_references = self.decoder( - query=query, - key=None, - value=memory, - attn_masks=attn_mask, - key_padding_mask=mask_flatten, - reference_points=reference_points, - spatial_shapes=spatial_shapes, - level_start_index=level_start_index, - valid_ratios=valid_ratios, - reg_branches=reg_branches, - **kwargs) - - inter_references_out = inter_references - - return inter_states, inter_references_out, \ - topk_score, topk_anchor, memory - - def forward_aux(self, - mlvl_feats, - mlvl_masks, - query_embed, - mlvl_pos_embeds, - pos_anchors, - pos_feats=None, - reg_branches=None, - cls_branches=None, - return_encoder_output=False, - attn_masks=None, - head_idx=0, - **kwargs): - feat_flatten = [] - mask_flatten = [] - spatial_shapes = [] - for lvl, (feat, mask, pos_embed) in enumerate( - zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)): - bs, c, h, w = feat.shape - spatial_shape = (h, w) - spatial_shapes.append(spatial_shape) - feat = feat.flatten(2).transpose(1, 2) - mask = mask.flatten(1) - feat_flatten.append(feat) - mask_flatten.append(mask) - feat_flatten = torch.cat(feat_flatten, 1) - mask_flatten = torch.cat(mask_flatten, 1) - spatial_shapes = torch.as_tensor( - spatial_shapes, dtype=torch.long, device=feat_flatten.device) - level_start_index = torch.cat((spatial_shapes.new_zeros( - (1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) - valid_ratios = torch.stack( - [self.get_valid_ratio(m) for m in mlvl_masks], 1) - - feat_flatten = feat_flatten.permute(1, 0, 2) # (H*W, bs, embed_dims) - - memory = feat_flatten - memory = memory.permute(1, 0, 2) - bs, _, c = memory.shape - - topk_coords_unact = inverse_sigmoid(pos_anchors) - reference_points = pos_anchors - if self.num_co_heads > 0: - pos_trans_out = self.aux_pos_trans_norm[head_idx]( - self.aux_pos_trans[head_idx]( - self.get_proposal_pos_embed(topk_coords_unact))) - query = pos_trans_out - if self.with_coord_feat: - query = query + self.pos_feats_norm[head_idx]( - self.pos_feats_trans[head_idx](pos_feats)) - - # decoder - query = query.permute(1, 0, 2) - memory = memory.permute(1, 0, 2) - inter_states, inter_references = self.decoder( - query=query, - key=None, - value=memory, - attn_masks=None, - key_padding_mask=mask_flatten, - reference_points=reference_points, - spatial_shapes=spatial_shapes, - 
level_start_index=level_start_index, - valid_ratios=valid_ratios, - reg_branches=reg_branches, - **kwargs) - - inter_references_out = inter_references - - return inter_states, inter_references_out - - -@MODELS.register_module() -class DetrTransformerEncoder(TransformerLayerSequence): - """TransformerEncoder of DETR. - - Args: - post_norm_cfg (dict): Config of last normalization layer. Default: - `LN`. Only used when `self.pre_norm` is `True` - """ - - def __init__(self, - *args, - post_norm_cfg=dict(type='LN'), - with_cp=-1, - **kwargs): - super(DetrTransformerEncoder, self).__init__(*args, **kwargs) - if post_norm_cfg is not None: - self.post_norm = build_norm_layer( - post_norm_cfg, self.embed_dims)[1] if self.pre_norm else None - else: - assert not self.pre_norm, f'Use prenorm in ' \ - f'{self.__class__.__name__},' \ - f'Please specify post_norm_cfg' - self.post_norm = None - self.with_cp = with_cp - if self.with_cp > 0: - if checkpoint_wrapper is None: - warnings.warn('If you want to reduce GPU memory usage, \ - please install fairscale by executing the \ - following command: pip install fairscale.') - return - for i in range(self.with_cp): - self.layers[i] = checkpoint_wrapper(self.layers[i]) - - -@MODELS.register_module() -class DetrTransformerDecoderLayer(BaseTransformerLayer): - """Implements decoder layer in DETR transformer. - - Args: - attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict )): - Configs for self_attention or cross_attention, the order - should be consistent with it in `operation_order`. If it is - a dict, it would be expand to the number of attention in - `operation_order`. - feedforward_channels (int): The hidden dimension for FFNs. - ffn_dropout (float): Probability of an element to be zeroed - in ffn. Default 0.0. - operation_order (tuple[str]): The execution order of operation - in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). - Default:None - act_cfg (dict): The activation config for FFNs. Default: `LN` - norm_cfg (dict): Config dict for normalization layer. - Default: `LN`. - ffn_num_fcs (int): The number of fully-connected layers in FFNs. - Default:2. - """ - - def __init__(self, - attn_cfgs, - feedforward_channels, - ffn_dropout=0.0, - operation_order=None, - act_cfg=dict(type='ReLU', inplace=True), - norm_cfg=dict(type='LN'), - ffn_num_fcs=2, - **kwargs): - super(DetrTransformerDecoderLayer, self).__init__( - attn_cfgs=attn_cfgs, - feedforward_channels=feedforward_channels, - ffn_dropout=ffn_dropout, - operation_order=operation_order, - act_cfg=act_cfg, - norm_cfg=norm_cfg, - ffn_num_fcs=ffn_num_fcs, - **kwargs) - assert len(operation_order) == 6 - assert set(operation_order) == set( - ['self_attn', 'norm', 'cross_attn', 'ffn']) diff --git a/projects/CO-DETR/configs/codino/co_dino_5scale_r50_8xb2_1x_coco.py b/projects/CO-DETR/configs/codino/co_dino_5scale_r50_8xb2_1x_coco.py deleted file mode 100644 index 1a4130437..000000000 --- a/projects/CO-DETR/configs/codino/co_dino_5scale_r50_8xb2_1x_coco.py +++ /dev/null @@ -1,68 +0,0 @@ -_base_ = './co_dino_5scale_r50_lsj_8xb2_1x_coco.py' - -model = dict( - use_lsj=False, data_preprocessor=dict(pad_mask=False, batch_augments=None)) - -# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different -# from the default setting in mmdet. 
-train_pipeline = [ - dict(type='LoadImageFromFile', backend_args=_base_.backend_args), - dict(type='LoadAnnotations', with_bbox=True), - dict(type='RandomFlip', prob=0.5), - dict( - type='RandomChoice', - transforms=[ - [ - dict( - type='RandomChoiceResize', - scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), - (608, 1333), (640, 1333), (672, 1333), (704, 1333), - (736, 1333), (768, 1333), (800, 1333)], - keep_ratio=True) - ], - [ - dict( - type='RandomChoiceResize', - # The radio of all image in train dataset < 7 - # follow the original implement - scales=[(400, 4200), (500, 4200), (600, 4200)], - keep_ratio=True), - dict( - type='RandomCrop', - crop_type='absolute_range', - crop_size=(384, 600), - allow_negative_crop=True), - dict( - type='RandomChoiceResize', - scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), - (608, 1333), (640, 1333), (672, 1333), (704, 1333), - (736, 1333), (768, 1333), (800, 1333)], - keep_ratio=True) - ] - ]), - dict(type='PackDetInputs') -] - -train_dataloader = dict( - dataset=dict( - _delete_=True, - type=_base_.dataset_type, - data_root=_base_.data_root, - ann_file='annotations/instances_train2017.json', - data_prefix=dict(img='train2017/'), - filter_cfg=dict(filter_empty_gt=False, min_size=32), - pipeline=train_pipeline, - backend_args=_base_.backend_args)) - -test_pipeline = [ - dict(type='LoadImageFromFile', backend_args=_base_.backend_args), - dict(type='Resize', scale=(1333, 800), keep_ratio=True), - dict(type='LoadAnnotations', with_bbox=True), - dict( - type='PackDetInputs', - meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', - 'scale_factor')) -] - -val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) -test_dataloader = val_dataloader diff --git a/projects/CO-DETR/configs/codino/co_dino_5scale_r50_lsj_8xb2_1x_coco.py b/projects/CO-DETR/configs/codino/co_dino_5scale_r50_lsj_8xb2_1x_coco.py deleted file mode 100644 index 876b90f89..000000000 --- a/projects/CO-DETR/configs/codino/co_dino_5scale_r50_lsj_8xb2_1x_coco.py +++ /dev/null @@ -1,359 +0,0 @@ -_base_ = 'mmdet::common/ssj_scp_270k_coco-instance.py' - -custom_imports = dict( - imports=['projects.CO-DETR.codetr'], allow_failed_imports=False) - -# model settings -num_dec_layer = 6 -loss_lambda = 2.0 -num_classes = 80 - -image_size = (1024, 1024) -batch_augments = [ - dict(type='BatchFixedSizePad', size=image_size, pad_mask=True) -] -model = dict( - type='CoDETR', - # If using the lsj augmentation, - # it is recommended to set it to True. 
- use_lsj=True, - # detr: 52.1 - # one-stage: 49.4 - # two-stage: 47.9 - eval_module='detr', # in ['detr', 'one-stage', 'two-stage'] - data_preprocessor=dict( - type='DetDataPreprocessor', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - bgr_to_rgb=True, - pad_mask=True, - batch_augments=batch_augments), - backbone=dict( - type='ResNet', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - frozen_stages=1, - norm_cfg=dict(type='BN', requires_grad=False), - norm_eval=True, - style='pytorch', - init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), - neck=dict( - type='ChannelMapper', - in_channels=[256, 512, 1024, 2048], - kernel_size=1, - out_channels=256, - act_cfg=None, - norm_cfg=dict(type='GN', num_groups=32), - num_outs=5), - query_head=dict( - type='CoDINOHead', - num_query=900, - num_classes=num_classes, - in_channels=2048, - as_two_stage=True, - dn_cfg=dict( - label_noise_scale=0.5, - box_noise_scale=1.0, - group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=100)), - transformer=dict( - type='CoDinoTransformer', - with_coord_feat=False, - num_co_heads=2, # ATSS Aux Head + Faster RCNN Aux Head - num_feature_levels=5, - encoder=dict( - type='DetrTransformerEncoder', - num_layers=6, - # number of layers that use checkpoint. - # The maximum value for the setting is num_layers. - # FairScale must be installed for it to work. - with_cp=4, - transformerlayers=dict( - type='BaseTransformerLayer', - attn_cfgs=dict( - type='MultiScaleDeformableAttention', - embed_dims=256, - num_levels=5, - dropout=0.0), - feedforward_channels=2048, - ffn_dropout=0.0, - operation_order=('self_attn', 'norm', 'ffn', 'norm'))), - decoder=dict( - type='DinoTransformerDecoder', - num_layers=6, - return_intermediate=True, - transformerlayers=dict( - type='DetrTransformerDecoderLayer', - attn_cfgs=[ - dict( - type='MultiheadAttention', - embed_dims=256, - num_heads=8, - dropout=0.0), - dict( - type='MultiScaleDeformableAttention', - embed_dims=256, - num_levels=5, - dropout=0.0), - ], - feedforward_channels=2048, - ffn_dropout=0.0, - operation_order=('self_attn', 'norm', 'cross_attn', 'norm', - 'ffn', 'norm')))), - positional_encoding=dict( - type='SinePositionalEncoding', - num_feats=128, - temperature=20, - normalize=True), - loss_cls=dict( # Different from the DINO - type='QualityFocalLoss', - use_sigmoid=True, - beta=2.0, - loss_weight=1.0), - loss_bbox=dict(type='L1Loss', loss_weight=5.0), - loss_iou=dict(type='GIoULoss', loss_weight=2.0)), - rpn_head=dict( - type='RPNHead', - in_channels=256, - feat_channels=256, - anchor_generator=dict( - type='AnchorGenerator', - octave_base_scale=4, - scales_per_octave=3, - ratios=[0.5, 1.0, 2.0], - strides=[4, 8, 16, 32, 64, 128]), - bbox_coder=dict( - type='DeltaXYWHBBoxCoder', - target_means=[.0, .0, .0, .0], - target_stds=[1.0, 1.0, 1.0, 1.0]), - loss_cls=dict( - type='CrossEntropyLoss', - use_sigmoid=True, - loss_weight=1.0 * num_dec_layer * loss_lambda), - loss_bbox=dict( - type='L1Loss', loss_weight=1.0 * num_dec_layer * loss_lambda)), - roi_head=[ - dict( - type='CoStandardRoIHead', - bbox_roi_extractor=dict( - type='SingleRoIExtractor', - roi_layer=dict( - type='RoIAlign', output_size=7, sampling_ratio=0), - out_channels=256, - featmap_strides=[4, 8, 16, 32, 64], - finest_scale=56), - bbox_head=dict( - type='Shared2FCBBoxHead', - in_channels=256, - fc_out_channels=1024, - roi_feat_size=7, - num_classes=num_classes, - bbox_coder=dict( - type='DeltaXYWHBBoxCoder', - target_means=[0., 0., 0., 0.], - target_stds=[0.1, 
0.1, 0.2, 0.2]), - reg_class_agnostic=False, - reg_decoded_bbox=True, - loss_cls=dict( - type='CrossEntropyLoss', - use_sigmoid=False, - loss_weight=1.0 * num_dec_layer * loss_lambda), - loss_bbox=dict( - type='GIoULoss', - loss_weight=10.0 * num_dec_layer * loss_lambda))) - ], - bbox_head=[ - dict( - type='CoATSSHead', - num_classes=num_classes, - in_channels=256, - stacked_convs=1, - feat_channels=256, - anchor_generator=dict( - type='AnchorGenerator', - ratios=[1.0], - octave_base_scale=8, - scales_per_octave=1, - strides=[4, 8, 16, 32, 64, 128]), - bbox_coder=dict( - type='DeltaXYWHBBoxCoder', - target_means=[.0, .0, .0, .0], - target_stds=[0.1, 0.1, 0.2, 0.2]), - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=1.0 * num_dec_layer * loss_lambda), - loss_bbox=dict( - type='GIoULoss', - loss_weight=2.0 * num_dec_layer * loss_lambda), - loss_centerness=dict( - type='CrossEntropyLoss', - use_sigmoid=True, - loss_weight=1.0 * num_dec_layer * loss_lambda)), - ], - # model training and testing settings - train_cfg=[ - dict( - assigner=dict( - type='HungarianAssigner', - match_costs=[ - dict(type='FocalLossCost', weight=2.0), - dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), - dict(type='IoUCost', iou_mode='giou', weight=2.0) - ])), - dict( - rpn=dict( - assigner=dict( - type='MaxIoUAssigner', - pos_iou_thr=0.7, - neg_iou_thr=0.3, - min_pos_iou=0.3, - match_low_quality=True, - ignore_iof_thr=-1), - sampler=dict( - type='RandomSampler', - num=256, - pos_fraction=0.5, - neg_pos_ub=-1, - add_gt_as_proposals=False), - allowed_border=-1, - pos_weight=-1, - debug=False), - rpn_proposal=dict( - nms_pre=4000, - max_per_img=1000, - nms=dict(type='nms', iou_threshold=0.7), - min_bbox_size=0), - rcnn=dict( - assigner=dict( - type='MaxIoUAssigner', - pos_iou_thr=0.5, - neg_iou_thr=0.5, - min_pos_iou=0.5, - match_low_quality=False, - ignore_iof_thr=-1), - sampler=dict( - type='RandomSampler', - num=512, - pos_fraction=0.25, - neg_pos_ub=-1, - add_gt_as_proposals=True), - pos_weight=-1, - debug=False)), - dict( - assigner=dict(type='ATSSAssigner', topk=9), - allowed_border=-1, - pos_weight=-1, - debug=False) - ], - test_cfg=[ - # Deferent from the DINO, we use the NMS. - dict( - max_per_img=300, - # NMS can improve the mAP by 0.2. 
- nms=dict(type='soft_nms', iou_threshold=0.8)), - dict( - rpn=dict( - nms_pre=1000, - max_per_img=1000, - nms=dict(type='nms', iou_threshold=0.7), - min_bbox_size=0), - rcnn=dict( - score_thr=0.0, - nms=dict(type='nms', iou_threshold=0.5), - max_per_img=100)), - dict( - # atss bbox head: - nms_pre=1000, - min_bbox_size=0, - score_thr=0.0, - nms=dict(type='nms', iou_threshold=0.6), - max_per_img=100), - # soft-nms is also supported for rcnn testing - # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05) - ]) - -# LSJ + CopyPaste -load_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='LoadAnnotations', with_bbox=True, with_mask=True), - dict( - type='RandomResize', - scale=image_size, - ratio_range=(0.1, 2.0), - keep_ratio=True), - dict( - type='RandomCrop', - crop_type='absolute_range', - crop_size=image_size, - recompute_bbox=True, - allow_negative_crop=True), - dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), - dict(type='RandomFlip', prob=0.5), - dict(type='Pad', size=image_size, pad_val=dict(img=(114, 114, 114))), -] - -train_pipeline = [ - dict(type='CopyPaste', max_num_pasted=100), - dict(type='PackDetInputs') -] - -train_dataloader = dict( - sampler=dict(type='DefaultSampler', shuffle=True), - dataset=dict( - pipeline=train_pipeline, - dataset=dict( - filter_cfg=dict(filter_empty_gt=False), pipeline=load_pipeline))) - -# follow ViTDet -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='Resize', scale=image_size, keep_ratio=True), # diff - dict(type='Pad', size=image_size, pad_val=dict(img=(114, 114, 114))), - dict(type='LoadAnnotations', with_bbox=True, with_mask=True), - dict( - type='PackDetInputs', - meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', - 'scale_factor')) -] - -val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) -test_dataloader = val_dataloader - -optim_wrapper = dict( - _delete_=True, - type='OptimWrapper', - optimizer=dict(type='AdamW', lr=2e-4, weight_decay=0.0001), - clip_grad=dict(max_norm=0.1, norm_type=2), - paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.1)})) - -val_evaluator = dict(metric='bbox') -test_evaluator = val_evaluator - -max_epochs = 12 -train_cfg = dict( - _delete_=True, - type='EpochBasedTrainLoop', - max_epochs=max_epochs, - val_interval=1) - -param_scheduler = [ - dict( - type='MultiStepLR', - begin=0, - end=max_epochs, - by_epoch=True, - milestones=[11], - gamma=0.1) -] - -default_hooks = dict( - checkpoint=dict(by_epoch=True, interval=1, max_keep_ckpts=3)) -log_processor = dict(by_epoch=True) - -# NOTE: `auto_scale_lr` is for automatically scaling LR, -# USER SHOULD NOT CHANGE ITS VALUES. 
-# base_batch_size = (8 GPUs) x (2 samples per GPU) -auto_scale_lr = dict(base_batch_size=16) diff --git a/projects/CO-DETR/configs/codino/co_dino_5scale_r50_lsj_8xb2_3x_coco.py b/projects/CO-DETR/configs/codino/co_dino_5scale_r50_lsj_8xb2_3x_coco.py deleted file mode 100644 index 9a9fc34f6..000000000 --- a/projects/CO-DETR/configs/codino/co_dino_5scale_r50_lsj_8xb2_3x_coco.py +++ /dev/null @@ -1,4 +0,0 @@ -_base_ = ['co_dino_5scale_r50_lsj_8xb2_1x_coco.py'] - -param_scheduler = [dict(milestones=[30])] -train_cfg = dict(max_epochs=36) diff --git a/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_16xb1_16e_o365tococo.py b/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_16xb1_16e_o365tococo.py deleted file mode 100644 index 77821c380..000000000 --- a/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_16xb1_16e_o365tococo.py +++ /dev/null @@ -1,115 +0,0 @@ -_base_ = ['co_dino_5scale_r50_8xb2_1x_coco.py'] - -pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth' # noqa -load_from = 'https://download.openmmlab.com/mmdetection/v3.0/codetr/co_dino_5scale_swin_large_16e_o365tococo-614254c9.pth' # noqa - -# model settings -model = dict( - backbone=dict( - _delete_=True, - type='SwinTransformer', - pretrain_img_size=384, - embed_dims=192, - depths=[2, 2, 18, 2], - num_heads=[6, 12, 24, 48], - window_size=12, - mlp_ratio=4, - qkv_bias=True, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0.3, - patch_norm=True, - out_indices=(0, 1, 2, 3), - # Please only add indices that would be used - # in FPN, otherwise some parameter will not be used - with_cp=True, - convert_weights=True, - init_cfg=dict(type='Pretrained', checkpoint=pretrained)), - neck=dict(in_channels=[192, 384, 768, 1536]), - query_head=dict( - dn_cfg=dict(box_noise_scale=0.4, group_cfg=dict(num_dn_queries=500)), - transformer=dict(encoder=dict(with_cp=6)))) - -train_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='LoadAnnotations', with_bbox=True), - dict(type='RandomFlip', prob=0.5), - dict( - type='RandomChoice', - transforms=[ - [ - dict( - type='RandomChoiceResize', - scales=[(480, 2048), (512, 2048), (544, 2048), (576, 2048), - (608, 2048), (640, 2048), (672, 2048), (704, 2048), - (736, 2048), (768, 2048), (800, 2048), (832, 2048), - (864, 2048), (896, 2048), (928, 2048), (960, 2048), - (992, 2048), (1024, 2048), (1056, 2048), - (1088, 2048), (1120, 2048), (1152, 2048), - (1184, 2048), (1216, 2048), (1248, 2048), - (1280, 2048), (1312, 2048), (1344, 2048), - (1376, 2048), (1408, 2048), (1440, 2048), - (1472, 2048), (1504, 2048), (1536, 2048)], - keep_ratio=True) - ], - [ - dict( - type='RandomChoiceResize', - # The radio of all image in train dataset < 7 - # follow the original implement - scales=[(400, 4200), (500, 4200), (600, 4200)], - keep_ratio=True), - dict( - type='RandomCrop', - crop_type='absolute_range', - crop_size=(384, 600), - allow_negative_crop=True), - dict( - type='RandomChoiceResize', - scales=[(480, 2048), (512, 2048), (544, 2048), (576, 2048), - (608, 2048), (640, 2048), (672, 2048), (704, 2048), - (736, 2048), (768, 2048), (800, 2048), (832, 2048), - (864, 2048), (896, 2048), (928, 2048), (960, 2048), - (992, 2048), (1024, 2048), (1056, 2048), - (1088, 2048), (1120, 2048), (1152, 2048), - (1184, 2048), (1216, 2048), (1248, 2048), - (1280, 2048), (1312, 2048), (1344, 2048), - (1376, 2048), (1408, 2048), (1440, 2048), - (1472, 2048), (1504, 2048), (1536, 2048)], - keep_ratio=True) - ] - ]), - 
dict(type='PackDetInputs') -] - -train_dataloader = dict( - batch_size=1, num_workers=1, dataset=dict(pipeline=train_pipeline)) - -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='Resize', scale=(2048, 1280), keep_ratio=True), - dict(type='LoadAnnotations', with_bbox=True), - dict( - type='PackDetInputs', - meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', - 'scale_factor')) -] - -val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) -test_dataloader = val_dataloader - -optim_wrapper = dict(optimizer=dict(lr=1e-4)) - -max_epochs = 16 -train_cfg = dict(max_epochs=max_epochs) - -param_scheduler = [ - dict( - type='MultiStepLR', - begin=0, - end=max_epochs, - by_epoch=True, - milestones=[8], - gamma=0.1) -] diff --git a/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_16xb1_1x_coco.py b/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_16xb1_1x_coco.py deleted file mode 100644 index d4a873464..000000000 --- a/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_16xb1_1x_coco.py +++ /dev/null @@ -1,31 +0,0 @@ -_base_ = ['co_dino_5scale_r50_8xb2_1x_coco.py'] - -pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth' # noqa - -# model settings -model = dict( - backbone=dict( - _delete_=True, - type='SwinTransformer', - pretrain_img_size=384, - embed_dims=192, - depths=[2, 2, 18, 2], - num_heads=[6, 12, 24, 48], - window_size=12, - mlp_ratio=4, - qkv_bias=True, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0.3, - patch_norm=True, - out_indices=(0, 1, 2, 3), - # Please only add indices that would be used - # in FPN, otherwise some parameter will not be used - with_cp=False, - convert_weights=True, - init_cfg=dict(type='Pretrained', checkpoint=pretrained)), - neck=dict(in_channels=[192, 384, 768, 1536]), - query_head=dict(transformer=dict(encoder=dict(with_cp=6)))) - -train_dataloader = dict(batch_size=1, num_workers=1) diff --git a/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_16xb1_3x_coco.py b/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_16xb1_3x_coco.py deleted file mode 100644 index c2fce29b9..000000000 --- a/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_16xb1_3x_coco.py +++ /dev/null @@ -1,6 +0,0 @@ -_base_ = ['co_dino_5scale_swin_l_16xb1_1x_coco.py'] -# model settings -model = dict(backbone=dict(drop_path_rate=0.6)) - -param_scheduler = [dict(milestones=[30])] -train_cfg = dict(max_epochs=36) diff --git a/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_lsj_16xb1_1x_coco.py b/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_lsj_16xb1_1x_coco.py deleted file mode 100644 index 4a9b3688b..000000000 --- a/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_lsj_16xb1_1x_coco.py +++ /dev/null @@ -1,72 +0,0 @@ -_base_ = ['co_dino_5scale_r50_lsj_8xb2_1x_coco.py'] - -image_size = (1280, 1280) -batch_augments = [ - dict(type='BatchFixedSizePad', size=image_size, pad_mask=True) -] -pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth' # noqa - -# model settings -model = dict( - data_preprocessor=dict(batch_augments=batch_augments), - backbone=dict( - _delete_=True, - type='SwinTransformer', - pretrain_img_size=384, - embed_dims=192, - depths=[2, 2, 18, 2], - num_heads=[6, 12, 24, 48], - window_size=12, - mlp_ratio=4, - qkv_bias=True, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0.3, - patch_norm=True, - out_indices=(0, 1, 2, 3), - # Please only 
add indices that would be used - # in FPN, otherwise some parameter will not be used - with_cp=False, - convert_weights=True, - init_cfg=dict(type='Pretrained', checkpoint=pretrained)), - neck=dict(in_channels=[192, 384, 768, 1536]), - query_head=dict(transformer=dict(encoder=dict(with_cp=6)))) - -load_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='LoadAnnotations', with_bbox=True, with_mask=True), - dict( - type='RandomResize', - scale=image_size, - ratio_range=(0.1, 2.0), - keep_ratio=True), - dict( - type='RandomCrop', - crop_type='absolute_range', - crop_size=image_size, - recompute_bbox=True, - allow_negative_crop=True), - dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), - dict(type='RandomFlip', prob=0.5), - dict(type='Pad', size=image_size, pad_val=dict(img=(114, 114, 114))), -] - -train_dataloader = dict( - batch_size=1, - num_workers=1, - dataset=dict(dataset=dict(pipeline=load_pipeline))) - -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='Resize', scale=image_size, keep_ratio=True), - dict(type='Pad', size=image_size, pad_val=dict(img=(114, 114, 114))), - dict(type='LoadAnnotations', with_bbox=True, with_mask=True), - dict( - type='PackDetInputs', - meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', - 'scale_factor')) -] - -val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) -test_dataloader = val_dataloader diff --git a/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_lsj_16xb1_3x_coco.py b/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_lsj_16xb1_3x_coco.py deleted file mode 100644 index bf9cd4f43..000000000 --- a/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_lsj_16xb1_3x_coco.py +++ /dev/null @@ -1,7 +0,0 @@ -_base_ = ['co_dino_5scale_swin_l_lsj_16xb1_1x_coco.py'] - -model = dict(backbone=dict(drop_path_rate=0.5)) - -param_scheduler = [dict(type='MultiStepLR', milestones=[30])] - -train_cfg = dict(max_epochs=36) diff --git a/projects/ConvNeXt-V2/README.md b/projects/ConvNeXt-V2/README.md deleted file mode 100644 index 7a9f56cd2..000000000 --- a/projects/ConvNeXt-V2/README.md +++ /dev/null @@ -1,37 +0,0 @@ -# ConvNeXt-V2 - -> [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](http://arxiv.org/abs/2301.00808) - -## Abstract - -Driven by improved architectures and better representation learning frameworks, the field of visual recognition has enjoyed rapid modernization and performance boost in the early 2020s. For example, modern ConvNets, represented by ConvNeXt \[52\], have demonstrated strong performance in various scenarios. While these models were originally designed for supervised learning with ImageNet labels, they can also potentially benefit from self-supervised learning techniques such as masked autoencoders (MAE) . However, we found that simply combining these two approaches leads to subpar performance. In this paper, we propose a fully convolutional masked autoencoder framework and a new Global Response Normalization (GRN) layer that can be added to the ConvNeXt architecture to enhance inter-channel feature competition. This co-design of self-supervised learning techniques and architectural improvement results in a new model family called ConvNeXt V2, which significantly improves the performance of pure ConvNets on various recognition benchmarks, including ImageNet classification, COCO detection, and ADE20K segmentation. 
We also provide pre-trained ConvNeXt V2 models of various sizes, ranging from an efficient 3.7M-parameter Atto model with 76.7% top-1 accuracy on ImageNet, to a 650M Huge model that achieves a state-of-the-art 88.9% accuracy using only public training data.
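For readers new to GRN, the normalization itself is only a few lines. The sketch below is illustrative, not the implementation used by this project: it assumes a channels-last `(N, H, W, C)` tensor and a small epsilon for numerical stability. The detection config further down does not reimplement GRN; it simply enables the layer via `use_grn=True` on the MMPretrain ConvNeXt backbone.

```python
import torch
import torch.nn as nn


class GRN(nn.Module):
    """Global Response Normalization over a channels-last (N, H, W, C) tensor.

    Minimal sketch of the operation described in the abstract above.
    """

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        # Learnable affine parameters, initialized to zero so the layer
        # starts out as an identity mapping (the residual term dominates).
        self.gamma = nn.Parameter(torch.zeros(1, 1, 1, dim))
        self.beta = nn.Parameter(torch.zeros(1, 1, 1, dim))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Aggregate a global response per channel (L2 norm over H and W).
        gx = torch.norm(x, p=2, dim=(1, 2), keepdim=True)    # (N, 1, 1, C)
        # Divisive normalization: each channel's response relative to the
        # mean response across channels, which encourages feature competition.
        nx = gx / (gx.mean(dim=-1, keepdim=True) + self.eps)  # (N, 1, 1, C)
        # Calibrate the input features and keep a residual connection.
        return self.gamma * (x * nx) + self.beta + x


if __name__ == '__main__':
    feat = torch.randn(2, 56, 56, 128)   # channels-last feature map
    print(GRN(128)(feat).shape)          # torch.Size([2, 56, 56, 128])
```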
- -
- -## Results and models - -| Method | Backbone | Pretrain | Lr schd | Augmentation | Mem (GB) | box AP | mask AP | Config | Download | -| :--------: | :-----------: | :------: | :-----: | :----------: | :------: | :----: | :-----: | :----------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| Mask R-CNN | ConvNeXt-V2-B | FCMAE | 3x | LSJ | 22.5 | 52.9 | 46.4 | [config](./mask-rcnn_convnext-v2-b_fpn_lsj-3x-fcmae_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/convnextv2/mask-rcnn_convnext-v2-b_fpn_lsj-3x-fcmae_coco/mask-rcnn_convnext-v2-b_fpn_lsj-3x-fcmae_coco_20230113_110947-757ee2dd.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/convnextv2/mask-rcnn_convnext-v2-b_fpn_lsj-3x-fcmae_coco/mask-rcnn_convnext-v2-b_fpn_lsj-3x-fcmae_coco_20230113_110947.log.json) | - -**Note**: - -- This is a pre-release version of ConvNeXt-V2 object detection. The official finetuning setting of ConvNeXt-V2 has not been released yet. -- ConvNeXt backbone needs to install [MMPretrain](https://github.com/open-mmlab/mmpretrain/) first, which has abundant backbones for downstream tasks. - -```shell -pip install mmpretrain -``` - -## Citation - -```bibtex -@article{Woo2023ConvNeXtV2, - title={ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders}, - author={Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon and Saining Xie}, - year={2023}, - journal={arXiv preprint arXiv:2301.00808}, -} -``` diff --git a/projects/ConvNeXt-V2/configs/mask-rcnn_convnext-v2-b_fpn_lsj-3x-fcmae_coco.py b/projects/ConvNeXt-V2/configs/mask-rcnn_convnext-v2-b_fpn_lsj-3x-fcmae_coco.py deleted file mode 100644 index 59e895504..000000000 --- a/projects/ConvNeXt-V2/configs/mask-rcnn_convnext-v2-b_fpn_lsj-3x-fcmae_coco.py +++ /dev/null @@ -1,92 +0,0 @@ -_base_ = [ - 'mmdet::_base_/models/mask-rcnn_r50_fpn.py', - 'mmdet::_base_/datasets/coco_instance.py', - 'mmdet::_base_/schedules/schedule_1x.py', - 'mmdet::_base_/default_runtime.py' -] - -# please install the mmpretrain -# import mmpretrain.models to trigger register_module in mmpretrain -custom_imports = dict( - imports=['mmpretrain.models'], allow_failed_imports=False) -checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-base_3rdparty-fcmae_in1k_20230104-8a798eaf.pth' # noqa -image_size = (1024, 1024) - -model = dict( - backbone=dict( - _delete_=True, - type='mmpretrain.ConvNeXt', - arch='base', - out_indices=[0, 1, 2, 3], - # TODO: verify stochastic depth rate {0.1, 0.2, 0.3, 0.4} - drop_path_rate=0.4, - layer_scale_init_value=0., # disable layer scale when using GRN - gap_before_final_norm=False, - use_grn=True, # V2 uses GRN - init_cfg=dict( - type='Pretrained', checkpoint=checkpoint_file, - prefix='backbone.')), - neck=dict(in_channels=[128, 256, 512, 1024]), - test_cfg=dict( - rpn=dict(nms=dict(type='nms')), # TODO: does RPN use soft_nms? 
- rcnn=dict(nms=dict(type='soft_nms')))) - -train_pipeline = [ - dict(type='LoadImageFromFile', backend_args=_base_.backend_args), - dict(type='LoadAnnotations', with_bbox=True, with_mask=True), - dict( - type='RandomResize', - scale=image_size, - ratio_range=(0.1, 2.0), - keep_ratio=True), - dict( - type='RandomCrop', - crop_type='absolute_range', - crop_size=image_size, - recompute_bbox=True, - allow_negative_crop=True), - dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), - dict(type='RandomFlip', prob=0.5), - dict(type='PackDetInputs') -] - -train_dataloader = dict( - batch_size=4, # total_batch_size 32 = 8 GPUS x 4 images - num_workers=8, - dataset=dict(pipeline=train_pipeline)) - -max_epochs = 36 -train_cfg = dict(max_epochs=max_epochs) - -# learning rate -param_scheduler = [ - dict( - type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, - end=1000), - dict( - type='MultiStepLR', - begin=0, - end=max_epochs, - by_epoch=True, - milestones=[27, 33], - gamma=0.1) -] - -# Enable automatic-mixed-precision training with AmpOptimWrapper. -optim_wrapper = dict( - type='AmpOptimWrapper', - constructor='LearningRateDecayOptimizerConstructor', - paramwise_cfg={ - 'decay_rate': 0.95, - 'decay_type': 'layer_wise', # TODO: sweep layer-wise lr decay? - 'num_layers': 12 - }, - optimizer=dict( - _delete_=True, - type='AdamW', - lr=0.0001, - betas=(0.9, 0.999), - weight_decay=0.05, - )) - -default_hooks = dict(checkpoint=dict(max_keep_ckpts=1)) diff --git a/projects/Detic/README.md b/projects/Detic/README.md deleted file mode 100644 index 98cd705b0..000000000 --- a/projects/Detic/README.md +++ /dev/null @@ -1,156 +0,0 @@ -# Note: This project has been deprecated, please use [Detic_new](../Detic_new). - -# Detecting Twenty-thousand Classes using Image-level Supervision - -## Description - -**Detic**: A **Det**ector with **i**mage **c**lasses that can use image-level labels to easily train detectors. - -

- -> [**Detecting Twenty-thousand Classes using Image-level Supervision**](http://arxiv.org/abs/2201.02605), -> Xingyi Zhou, Rohit Girdhar, Armand Joulin, Philipp Krähenbühl, Ishan Misra, -> *ECCV 2022 ([arXiv 2201.02605](http://arxiv.org/abs/2201.02605))* - -## Usage - - - -## Installation - -Detic requires to install CLIP. - -```shell -pip install git+https://github.com/openai/CLIP.git -``` - -### Demo - -#### Inference with existing dataset vocabulary embeddings - -First, go to the Detic project folder. - -```shell -cd projects/Detic -``` - -Then, download the pre-computed CLIP embeddings from [dataset metainfo](https://github.com/facebookresearch/Detic/tree/main/datasets/metadata) to the `datasets/metadata` folder. -The CLIP embeddings will be loaded to the zero-shot classifier during inference. -For example, you can download LVIS's class name embeddings with the following command: - -```shell -wget -P datasets/metadata https://raw.githubusercontent.com/facebookresearch/Detic/main/datasets/metadata/lvis_v1_clip_a%2Bcname.npy -``` - -You can run demo like this: - -```shell -python demo.py \ - ${IMAGE_PATH} \ - ${CONFIG_PATH} \ - ${MODEL_PATH} \ - --show \ - --score-thr 0.5 \ - --dataset lvis -``` - -![image](https://user-images.githubusercontent.com/12907710/213624759-f0a2ba0c-0f5c-4424-a350-5ba5349e5842.png) - -### Inference with custom vocabularies - -- Detic can detects any class given class names by using CLIP. - -You can detect custom classes with `--class-name` command: - -``` -python demo.py \ - ${IMAGE_PATH} \ - ${CONFIG_PATH} \ - ${MODEL_PATH} \ - --show \ - --score-thr 0.3 \ - --class-name headphone webcam paper coffe -``` - -![image](https://user-images.githubusercontent.com/12907710/213624637-e9e8a313-9821-4782-a18a-4408c876852b.png) - -Note that `headphone`, `paper` and `coffe` (typo intended) are not LVIS classes. Despite the misspelled class name, Detic can produce a reasonable detection for `coffe`. - -## Results - -Here we only provide the Detic Swin-B model for the open vocabulary demo. Multi-dataset training and open-vocabulary testing will be supported in the future. - -To find more variants, please visit the [official model zoo](https://github.com/facebookresearch/Detic/blob/main/docs/MODEL_ZOO.md). - -| Backbone | Training data | Config | Download | -| :------: | :------------------------: | :-------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| Swin-B | ImageNet-21K & LVIS & COCO | [config](./configs/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k_20230120-0d301978.pth) | - -## Citation - -If you find Detic is useful in your research or applications, please consider giving a star 🌟 to the [official repository](https://github.com/facebookresearch/Detic) and citing Detic by the following BibTeX entry. - -```BibTeX -@inproceedings{zhou2022detecting, - title={Detecting Twenty-thousand Classes using Image-level Supervision}, - author={Zhou, Xingyi and Girdhar, Rohit and Joulin, Armand and Kr{\"a}henb{\"u}hl, Philipp and Misra, Ishan}, - booktitle={ECCV}, - year={2022} -} - -``` - -## Checklist - - - -- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. 
- - - [x] Finish the code - - - - - [x] Basic docstrings & proper citation - - - - - [x] Test-time correctness - - - - - [x] A full README - - - -- [ ] Milestone 2: Indicates a successful model implementation. - - - [ ] Training-time correctness - - - -- [ ] Milestone 3: Good to be a part of our core package! - - - [ ] Type hints and docstrings - - - - - [ ] Unit tests - - - - - [ ] Code polishing - - - - - [ ] Metafile.yml - - - -- [ ] Move your modules into the core package following the codebase's file hierarchy structure. - - - -- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure. diff --git a/projects/Detic/configs/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k.py b/projects/Detic/configs/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k.py deleted file mode 100644 index d554c40ec..000000000 --- a/projects/Detic/configs/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k.py +++ /dev/null @@ -1,298 +0,0 @@ -_base_ = 'mmdet::common/lsj-200e_coco-detection.py' - -custom_imports = dict( - imports=['projects.Detic.detic'], allow_failed_imports=False) - -image_size = (1024, 1024) -batch_augments = [dict(type='BatchFixedSizePad', size=image_size)] - -cls_layer = dict( - type='ZeroShotClassifier', - zs_weight_path='rand', - zs_weight_dim=512, - use_bias=0.0, - norm_weight=True, - norm_temperature=50.0) -reg_layer = [ - dict(type='Linear', in_features=1024, out_features=1024), - dict(type='ReLU', inplace=True), - dict(type='Linear', in_features=1024, out_features=4) -] - -num_classes = 22047 - -model = dict( - type='CascadeRCNN', - data_preprocessor=dict( - type='DetDataPreprocessor', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - bgr_to_rgb=True, - pad_size_divisor=32, - batch_augments=batch_augments), - backbone=dict( - type='SwinTransformer', - embed_dims=128, - depths=[2, 2, 18, 2], - num_heads=[4, 8, 16, 32], - window_size=7, - mlp_ratio=4, - qkv_bias=True, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0.3, - patch_norm=True, - out_indices=(1, 2, 3), - with_cp=False), - neck=dict( - type='FPN', - in_channels=[256, 512, 1024], - out_channels=256, - start_level=0, - add_extra_convs='on_output', - num_outs=5, - init_cfg=dict(type='Caffe2Xavier', layer='Conv2d'), - relu_before_extra_convs=True), - rpn_head=dict( - type='CenterNetRPNHead', - num_classes=1, - in_channels=256, - stacked_convs=4, - feat_channels=256, - strides=[8, 16, 32, 64, 128], - conv_bias=True, - norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), - loss_cls=dict( - type='GaussianFocalLoss', - pos_weight=0.25, - neg_weight=0.75, - loss_weight=1.0), - loss_bbox=dict(type='GIoULoss', loss_weight=2.0), - ), - roi_head=dict( - type='DeticRoIHead', - num_stages=3, - stage_loss_weights=[1, 0.5, 0.25], - bbox_roi_extractor=dict( - type='SingleRoIExtractor', - roi_layer=dict( - type='RoIAlign', - output_size=7, - sampling_ratio=0, - use_torchvision=True), - out_channels=256, - featmap_strides=[8, 16, 32], - # approximately equal to - # canonical_box_size=224, canonical_level=4 in D2 - finest_scale=112), - bbox_head=[ - dict( - type='DeticBBoxHead', - in_channels=256, - fc_out_channels=1024, - roi_feat_size=7, - num_classes=num_classes, - cls_predictor_cfg=cls_layer, - reg_predictor_cfg=reg_layer, - bbox_coder=dict( - type='DeltaXYWHBBoxCoder', - target_means=[0., 0., 0., 0.], - target_stds=[0.1, 0.1, 0.2, 0.2]), - reg_class_agnostic=True, - loss_cls=dict( - type='CrossEntropyLoss', use_sigmoid=True, - loss_weight=1.0), - 
loss_bbox=dict(type='SmoothL1Loss', beta=1.0, - loss_weight=1.0)), - dict( - type='DeticBBoxHead', - in_channels=256, - fc_out_channels=1024, - roi_feat_size=7, - num_classes=num_classes, - cls_predictor_cfg=cls_layer, - reg_predictor_cfg=reg_layer, - bbox_coder=dict( - type='DeltaXYWHBBoxCoder', - target_means=[0., 0., 0., 0.], - target_stds=[0.05, 0.05, 0.1, 0.1]), - reg_class_agnostic=True, - loss_cls=dict( - type='CrossEntropyLoss', use_sigmoid=True, - loss_weight=1.0), - loss_bbox=dict(type='SmoothL1Loss', beta=1.0, - loss_weight=1.0)), - dict( - type='DeticBBoxHead', - in_channels=256, - fc_out_channels=1024, - roi_feat_size=7, - num_classes=num_classes, - cls_predictor_cfg=cls_layer, - reg_predictor_cfg=reg_layer, - bbox_coder=dict( - type='DeltaXYWHBBoxCoder', - target_means=[0., 0., 0., 0.], - target_stds=[0.033, 0.033, 0.067, 0.067]), - reg_class_agnostic=True, - loss_cls=dict( - type='CrossEntropyLoss', use_sigmoid=True, - loss_weight=1.0), - loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) - ], - mask_roi_extractor=dict( - type='SingleRoIExtractor', - roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), - out_channels=256, - featmap_strides=[8, 16, 32], - # approximately equal to - # canonical_box_size=224, canonical_level=4 in D2 - finest_scale=112), - mask_head=dict( - type='FCNMaskHead', - num_convs=4, - in_channels=256, - conv_out_channels=256, - class_agnostic=True, - num_classes=num_classes, - loss_mask=dict( - type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), - # model training and testing settings - train_cfg=dict( - rpn=dict( - assigner=dict( - type='MaxIoUAssigner', - pos_iou_thr=0.7, - neg_iou_thr=0.3, - min_pos_iou=0.3, - match_low_quality=True, - ignore_iof_thr=-1), - sampler=dict( - type='RandomSampler', - num=256, - pos_fraction=0.5, - neg_pos_ub=-1, - add_gt_as_proposals=False), - allowed_border=0, - pos_weight=-1, - debug=False), - rpn_proposal=dict( - nms_pre=2000, - max_per_img=2000, - nms=dict(type='nms', iou_threshold=0.7), - min_bbox_size=0), - rcnn=[ - dict( - assigner=dict( - type='MaxIoUAssigner', - pos_iou_thr=0.6, - neg_iou_thr=0.6, - min_pos_iou=0.6, - match_low_quality=False, - ignore_iof_thr=-1), - sampler=dict( - type='RandomSampler', - num=512, - pos_fraction=0.25, - neg_pos_ub=-1, - add_gt_as_proposals=True), - mask_size=28, - pos_weight=-1, - debug=False), - dict( - assigner=dict( - type='MaxIoUAssigner', - pos_iou_thr=0.7, - neg_iou_thr=0.7, - min_pos_iou=0.7, - match_low_quality=False, - ignore_iof_thr=-1), - sampler=dict( - type='RandomSampler', - num=512, - pos_fraction=0.25, - neg_pos_ub=-1, - add_gt_as_proposals=True), - mask_size=28, - pos_weight=-1, - debug=False), - dict( - assigner=dict( - type='MaxIoUAssigner', - pos_iou_thr=0.8, - neg_iou_thr=0.8, - min_pos_iou=0.8, - match_low_quality=False, - ignore_iof_thr=-1), - sampler=dict( - type='RandomSampler', - num=512, - pos_fraction=0.25, - neg_pos_ub=-1, - add_gt_as_proposals=True), - mask_size=28, - pos_weight=-1, - debug=False) - ]), - test_cfg=dict( - rpn=dict( - score_thr=0.0001, - nms_pre=1000, - max_per_img=256, - nms=dict(type='nms', iou_threshold=0.9), - min_bbox_size=0), - rcnn=dict( - score_thr=0.02, - nms=dict(type='nms', iou_threshold=0.5), - max_per_img=300, - mask_thr_binary=0.5))) - -backend = 'pillow' -test_pipeline = [ - dict( - type='LoadImageFromFile', - backend_args=_base_.backend_args, - imdecode_backend=backend), - dict(type='Resize', scale=(1333, 800), keep_ratio=True, backend=backend), - dict( - type='LoadAnnotations', - 
with_bbox=True, - with_mask=True, - poly2mask=False), - dict( - type='PackDetInputs', - meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', - 'scale_factor')) -] - -train_dataloader = dict(batch_size=8, num_workers=4) -val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) -test_dataloader = val_dataloader -# Enable automatic-mixed-precision training with AmpOptimWrapper. -optim_wrapper = dict( - type='AmpOptimWrapper', - optimizer=dict( - type='SGD', lr=0.01 * 4, momentum=0.9, weight_decay=0.00004), - paramwise_cfg=dict(norm_decay_mult=0.)) - -param_scheduler = [ - dict( - type='LinearLR', - start_factor=0.00025, - by_epoch=False, - begin=0, - end=4000), - dict( - type='MultiStepLR', - begin=0, - end=25, - by_epoch=True, - milestones=[22, 24], - gamma=0.1) -] - -# NOTE: `auto_scale_lr` is for automatically scaling LR, -# USER SHOULD NOT CHANGE ITS VALUES. -# base_batch_size = (8 GPUs) x (8 samples per GPU) -auto_scale_lr = dict(base_batch_size=64) diff --git a/projects/Detic/demo.py b/projects/Detic/demo.py deleted file mode 100644 index d5c80c9aa..000000000 --- a/projects/Detic/demo.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import os -import urllib -from argparse import ArgumentParser - -import mmcv -import torch -from mmengine.logging import print_log -from mmengine.utils import ProgressBar, scandir - -from mmdet.apis import inference_detector, init_detector -from mmdet.registry import VISUALIZERS -from mmdet.utils import register_all_modules - -IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', - '.tiff', '.webp') - - -def get_file_list(source_root: str) -> [list, dict]: - """Get file list. - - Args: - source_root (str): image or video source path - - Return: - source_file_path_list (list): A list for all source file. - source_type (dict): Source type: file or url or dir. 
- """ - is_dir = os.path.isdir(source_root) - is_url = source_root.startswith(('http:/', 'https:/')) - is_file = os.path.splitext(source_root)[-1].lower() in IMG_EXTENSIONS - - source_file_path_list = [] - if is_dir: - # when input source is dir - for file in scandir(source_root, IMG_EXTENSIONS, recursive=True): - source_file_path_list.append(os.path.join(source_root, file)) - elif is_url: - # when input source is url - filename = os.path.basename( - urllib.parse.unquote(source_root).split('?')[0]) - file_save_path = os.path.join(os.getcwd(), filename) - print(f'Downloading source file to {file_save_path}') - torch.hub.download_url_to_file(source_root, file_save_path) - source_file_path_list = [file_save_path] - elif is_file: - # when input source is single image - source_file_path_list = [source_root] - else: - print('Cannot find image file.') - - source_type = dict(is_dir=is_dir, is_url=is_url, is_file=is_file) - - return source_file_path_list, source_type - - -def parse_args(): - parser = ArgumentParser() - parser.add_argument( - 'img', help='Image path, include image file, dir and URL.') - parser.add_argument('config', help='Config file') - parser.add_argument('checkpoint', help='Checkpoint file') - parser.add_argument( - '--out-dir', default='./output', help='Path to output file') - parser.add_argument( - '--device', default='cuda:0', help='Device used for inference') - parser.add_argument( - '--show', action='store_true', help='Show the detection results') - parser.add_argument( - '--score-thr', type=float, default=0.3, help='Bbox score threshold') - parser.add_argument( - '--dataset', type=str, help='dataset name to load the text embedding') - parser.add_argument( - '--class-name', nargs='+', type=str, help='custom class names') - args = parser.parse_args() - return args - - -def main(): - args = parse_args() - - # register all modules in mmdet into the registries - register_all_modules() - - # build the model from a config file and a checkpoint file - model = init_detector(args.config, args.checkpoint, device=args.device) - - if not os.path.exists(args.out_dir) and not args.show: - os.mkdir(args.out_dir) - - # init visualizer - visualizer = VISUALIZERS.build(model.cfg.visualizer) - visualizer.dataset_meta = model.dataset_meta - - # get file list - files, source_type = get_file_list(args.img) - from detic.utils import (get_class_names, get_text_embeddings, - reset_cls_layer_weight) - - # class name embeddings - if args.class_name: - dataset_classes = args.class_name - elif args.dataset: - dataset_classes = get_class_names(args.dataset) - embedding = get_text_embeddings( - dataset=args.dataset, custom_vocabulary=args.class_name) - visualizer.dataset_meta['classes'] = dataset_classes - reset_cls_layer_weight(model, embedding) - - # start detector inference - progress_bar = ProgressBar(len(files)) - for file in files: - result = inference_detector(model, file) - - img = mmcv.imread(file) - img = mmcv.imconvert(img, 'bgr', 'rgb') - - if source_type['is_dir']: - filename = os.path.relpath(file, args.img).replace('/', '_') - else: - filename = os.path.basename(file) - out_file = None if args.show else os.path.join(args.out_dir, filename) - - progress_bar.update() - - visualizer.add_datasample( - filename, - img, - data_sample=result, - draw_gt=False, - show=args.show, - wait_time=0, - out_file=out_file, - pred_score_thr=args.score_thr) - - if not args.show: - print_log( - f'\nResults have been saved at {os.path.abspath(args.out_dir)}') - - -if __name__ == '__main__': - main() diff --git 
a/projects/Detic/detic/__init__.py b/projects/Detic/detic/__init__.py deleted file mode 100644 index d0ad07025..000000000 --- a/projects/Detic/detic/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .centernet_rpn_head import CenterNetRPNHead -from .detic_bbox_head import DeticBBoxHead -from .detic_roi_head import DeticRoIHead -from .zero_shot_classifier import ZeroShotClassifier - -__all__ = [ - 'CenterNetRPNHead', 'DeticBBoxHead', 'DeticRoIHead', 'ZeroShotClassifier' -] diff --git a/projects/Detic/detic/centernet_rpn_head.py b/projects/Detic/detic/centernet_rpn_head.py deleted file mode 100644 index 765d6dfb2..000000000 --- a/projects/Detic/detic/centernet_rpn_head.py +++ /dev/null @@ -1,196 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -from typing import List, Sequence, Tuple - -import torch -import torch.nn as nn -from mmcv.cnn import Scale -from mmengine import ConfigDict -from mmengine.structures import InstanceData -from torch import Tensor - -from mmdet.models.dense_heads import CenterNetUpdateHead -from mmdet.models.utils import multi_apply -from mmdet.registry import MODELS - -INF = 1000000000 -RangeType = Sequence[Tuple[int, int]] - - -@MODELS.register_module(force=True) # avoid bug -class CenterNetRPNHead(CenterNetUpdateHead): - """CenterNetUpdateHead is an improved version of CenterNet in CenterNet2. - - Paper link ``_. - """ - - def _init_layers(self) -> None: - """Initialize layers of the head.""" - self._init_reg_convs() - self._init_predictor() - - def _init_predictor(self) -> None: - """Initialize predictor layers of the head.""" - self.conv_cls = nn.Conv2d( - self.feat_channels, self.num_classes, 3, padding=1) - self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1) - - def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor], List[Tensor]]: - """Forward features from the upstream network. - - Args: - x (tuple[Tensor]): Features from the upstream network, each is - a 4D-tensor. - - Returns: - tuple: A tuple of each level outputs. - - - cls_scores (list[Tensor]): Box scores for each scale level, \ - each is a 4D-tensor, the channel number is num_classes. - - bbox_preds (list[Tensor]): Box energies / deltas for each \ - scale level, each is a 4D-tensor, the channel number is 4. - """ - res = multi_apply(self.forward_single, x, self.scales, self.strides) - return res - - def forward_single(self, x: Tensor, scale: Scale, - stride: int) -> Tuple[Tensor, Tensor]: - """Forward features of a single scale level. - - Args: - x (Tensor): FPN feature maps of the specified stride. - scale (:obj:`mmcv.cnn.Scale`): Learnable scale module to resize - the bbox prediction. - stride (int): The corresponding stride for feature maps. - - Returns: - tuple: scores for each class, bbox predictions of - input feature maps. - """ - for m in self.reg_convs: - x = m(x) - cls_score = self.conv_cls(x) - bbox_pred = self.conv_reg(x) - # scale the bbox_pred of different level - # float to avoid overflow when enabling FP16 - bbox_pred = scale(bbox_pred).float() - # bbox_pred needed for gradient computation has been modified - # by F.relu(bbox_pred) when run with PyTorch 1.10. 
So replace - # F.relu(bbox_pred) with bbox_pred.clamp(min=0) - bbox_pred = bbox_pred.clamp(min=0) - if not self.training: - bbox_pred *= stride - return cls_score, bbox_pred # score aligned, box larger - - def _predict_by_feat_single(self, - cls_score_list: List[Tensor], - bbox_pred_list: List[Tensor], - score_factor_list: List[Tensor], - mlvl_priors: List[Tensor], - img_meta: dict, - cfg: ConfigDict, - rescale: bool = False, - with_nms: bool = True) -> InstanceData: - """Transform a single image's features extracted from the head into - bbox results. - - Args: - cls_score_list (list[Tensor]): Box scores from all scale - levels of a single image, each item has shape - (num_priors * num_classes, H, W). - bbox_pred_list (list[Tensor]): Box energies / deltas from - all scale levels of a single image, each item has shape - (num_priors * 4, H, W). - score_factor_list (list[Tensor]): Score factor from all scale - levels of a single image, each item has shape - (num_priors * 1, H, W). - mlvl_priors (list[Tensor]): Each element in the list is - the priors of a single level in feature pyramid. In all - anchor-based methods, it has shape (num_priors, 4). In - all anchor-free methods, it has shape (num_priors, 2) - when `with_stride=True`, otherwise it still has shape - (num_priors, 4). - img_meta (dict): Image meta info. - cfg (mmengine.Config): Test / postprocessing configuration, - if None, test_cfg would be used. - rescale (bool): If True, return boxes in original image space. - Defaults to False. - with_nms (bool): If True, do nms before return boxes. - Defaults to True. - - Returns: - :obj:`InstanceData`: Detection results of each image - after the post process. - Each item usually contains following keys. - - - scores (Tensor): Classification scores, has a shape - (num_instance, ) - - labels (Tensor): Labels of bboxes, has a shape - (num_instances, ). - - bboxes (Tensor): Has a shape (num_instances, 4), - the last dimension 4 arrange as (x1, y1, x2, y2). 
- """ - - cfg = self.test_cfg if cfg is None else cfg - cfg = copy.deepcopy(cfg) - nms_pre = cfg.get('nms_pre', -1) - - mlvl_bbox_preds = [] - mlvl_valid_priors = [] - mlvl_scores = [] - mlvl_labels = [] - - for level_idx, (cls_score, bbox_pred, score_factor, priors) in \ - enumerate(zip(cls_score_list, bbox_pred_list, - score_factor_list, mlvl_priors)): - - assert cls_score.size()[-2:] == bbox_pred.size()[-2:] - - dim = self.bbox_coder.encode_size - bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, dim) - cls_score = cls_score.permute(1, 2, - 0).reshape(-1, self.cls_out_channels) - heatmap = cls_score.sigmoid() - score_thr = cfg.get('score_thr', 0) - - candidate_inds = heatmap > score_thr # 0.05 - pre_nms_top_n = candidate_inds.sum() # N - pre_nms_top_n = pre_nms_top_n.clamp(max=nms_pre) # N - - heatmap = heatmap[candidate_inds] # n - - candidate_nonzeros = candidate_inds.nonzero() # n - box_loc = candidate_nonzeros[:, 0] # n - labels = candidate_nonzeros[:, 1] # n - - bbox_pred = bbox_pred[box_loc] # n x 4 - per_grids = priors[box_loc] # n x 2 - - if candidate_inds.sum().item() > pre_nms_top_n.item(): - heatmap, top_k_indices = \ - heatmap.topk(pre_nms_top_n, sorted=False) - labels = labels[top_k_indices] - bbox_pred = bbox_pred[top_k_indices] - per_grids = per_grids[top_k_indices] - - bboxes = self.bbox_coder.decode(per_grids, bbox_pred) - # avoid invalid boxes in RoI heads - bboxes[:, 2] = torch.max(bboxes[:, 2], bboxes[:, 0] + 0.01) - bboxes[:, 3] = torch.max(bboxes[:, 3], bboxes[:, 1] + 0.01) - - mlvl_bbox_preds.append(bboxes) - mlvl_valid_priors.append(priors) - mlvl_scores.append(torch.sqrt(heatmap)) - mlvl_labels.append(labels) - - results = InstanceData() - results.bboxes = torch.cat(mlvl_bbox_preds) - results.scores = torch.cat(mlvl_scores) - results.labels = torch.cat(mlvl_labels) - - return self._bbox_post_process( - results=results, - cfg=cfg, - rescale=rescale, - with_nms=with_nms, - img_meta=img_meta) diff --git a/projects/Detic/detic/detic_bbox_head.py b/projects/Detic/detic/detic_bbox_head.py deleted file mode 100644 index 9408cbe04..000000000 --- a/projects/Detic/detic/detic_bbox_head.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from typing import Optional, Union - -from mmengine.config import ConfigDict -from mmengine.structures import InstanceData -from torch import Tensor - -from mmdet.models.layers import multiclass_nms -from mmdet.models.roi_heads.bbox_heads import Shared2FCBBoxHead -from mmdet.models.utils import empty_instances -from mmdet.registry import MODELS -from mmdet.structures.bbox import get_box_tensor, scale_boxes - - -@MODELS.register_module(force=True) # avoid bug -class DeticBBoxHead(Shared2FCBBoxHead): - - def __init__(self, - *args, - init_cfg: Optional[Union[dict, ConfigDict]] = None, - **kwargs) -> None: - super().__init__(*args, init_cfg=init_cfg, **kwargs) - # reconstruct fc_cls and fc_reg since input channels are changed - assert self.with_cls - cls_channels = self.num_classes - cls_predictor_cfg_ = self.cls_predictor_cfg.copy() - cls_predictor_cfg_.update( - in_features=self.cls_last_dim, out_features=cls_channels) - self.fc_cls = MODELS.build(cls_predictor_cfg_) - - def _predict_by_feat_single( - self, - roi: Tensor, - cls_score: Tensor, - bbox_pred: Tensor, - img_meta: dict, - rescale: bool = False, - rcnn_test_cfg: Optional[ConfigDict] = None) -> InstanceData: - """Transform a single image's features extracted from the head into - bbox results. - - Args: - roi (Tensor): Boxes to be transformed. 
Has shape (num_boxes, 5). - last dimension 5 arrange as (batch_index, x1, y1, x2, y2). - cls_score (Tensor): Box scores, has shape - (num_boxes, num_classes + 1). - bbox_pred (Tensor): Box energies / deltas. - has shape (num_boxes, num_classes * 4). - img_meta (dict): image information. - rescale (bool): If True, return boxes in original image space. - Defaults to False. - rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of Bbox Head. - Defaults to None - - Returns: - :obj:`InstanceData`: Detection results of each image\ - Each item usually contains following keys. - - - scores (Tensor): Classification scores, has a shape - (num_instance, ) - - labels (Tensor): Labels of bboxes, has a shape - (num_instances, ). - - bboxes (Tensor): Has a shape (num_instances, 4), - the last dimension 4 arrange as (x1, y1, x2, y2). - """ - results = InstanceData() - if roi.shape[0] == 0: - return empty_instances([img_meta], - roi.device, - task_type='bbox', - instance_results=[results], - box_type=self.predict_box_type, - use_box_type=False, - num_classes=self.num_classes, - score_per_cls=rcnn_test_cfg is None)[0] - scores = cls_score - img_shape = img_meta['img_shape'] - num_rois = roi.size(0) - - num_classes = 1 if self.reg_class_agnostic else self.num_classes - roi = roi.repeat_interleave(num_classes, dim=0) - bbox_pred = bbox_pred.view(-1, self.bbox_coder.encode_size) - bboxes = self.bbox_coder.decode( - roi[..., 1:], bbox_pred, max_shape=img_shape) - - if rescale and bboxes.size(0) > 0: - assert img_meta.get('scale_factor') is not None - scale_factor = [1 / s for s in img_meta['scale_factor']] - bboxes = scale_boxes(bboxes, scale_factor) - - # Get the inside tensor when `bboxes` is a box type - bboxes = get_box_tensor(bboxes) - box_dim = bboxes.size(-1) - bboxes = bboxes.view(num_rois, -1) - - if rcnn_test_cfg is None: - # This means that it is aug test. - # It needs to return the raw results without nms. - results.bboxes = bboxes - results.scores = scores - else: - det_bboxes, det_labels = multiclass_nms( - bboxes, - scores, - rcnn_test_cfg.score_thr, - rcnn_test_cfg.nms, - rcnn_test_cfg.max_per_img, - box_dim=box_dim) - results.bboxes = det_bboxes[:, :-1] - results.scores = det_bboxes[:, -1] - results.labels = det_labels - return results diff --git a/projects/Detic/detic/detic_roi_head.py b/projects/Detic/detic/detic_roi_head.py deleted file mode 100644 index a09c11c6e..000000000 --- a/projects/Detic/detic/detic_roi_head.py +++ /dev/null @@ -1,326 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from typing import List, Sequence, Tuple - -import torch -from mmengine.structures import InstanceData -from torch import Tensor - -from mmdet.models.roi_heads import CascadeRoIHead -from mmdet.models.task_modules.samplers import SamplingResult -from mmdet.models.test_time_augs import merge_aug_masks -from mmdet.models.utils.misc import empty_instances -from mmdet.registry import MODELS -from mmdet.structures import SampleList -from mmdet.structures.bbox import bbox2roi, get_box_tensor -from mmdet.utils import ConfigType, InstanceList, MultiConfig - - -@MODELS.register_module(force=True) # avoid bug -class DeticRoIHead(CascadeRoIHead): - - def init_mask_head(self, mask_roi_extractor: MultiConfig, - mask_head: MultiConfig) -> None: - """Initialize mask head and mask roi extractor. - - Args: - mask_head (dict): Config of mask in mask head. - mask_roi_extractor (:obj:`ConfigDict`, dict or list): - Config of mask roi extractor. 
- """ - self.mask_head = MODELS.build(mask_head) - - if mask_roi_extractor is not None: - self.share_roi_extractor = False - self.mask_roi_extractor = MODELS.build(mask_roi_extractor) - else: - self.share_roi_extractor = True - self.mask_roi_extractor = self.bbox_roi_extractor - - def _refine_roi(self, x: Tuple[Tensor], rois: Tensor, - batch_img_metas: List[dict], - num_proposals_per_img: Sequence[int], **kwargs) -> tuple: - """Multi-stage refinement of RoI. - - Args: - x (tuple[Tensor]): List of multi-level img features. - rois (Tensor): shape (n, 5), [batch_ind, x1, y1, x2, y2] - batch_img_metas (list[dict]): List of image information. - num_proposals_per_img (sequence[int]): number of proposals - in each image. - - Returns: - tuple: - - - rois (Tensor): Refined RoI. - - cls_scores (list[Tensor]): Average predicted - cls score per image. - - bbox_preds (list[Tensor]): Bbox branch predictions - for the last stage of per image. - """ - # "ms" in variable names means multi-stage - ms_scores = [] - for stage in range(self.num_stages): - bbox_results = self._bbox_forward( - stage=stage, x=x, rois=rois, **kwargs) - - # split batch bbox prediction back to each image - cls_scores = bbox_results['cls_score'].sigmoid() - bbox_preds = bbox_results['bbox_pred'] - - rois = rois.split(num_proposals_per_img, 0) - cls_scores = cls_scores.split(num_proposals_per_img, 0) - ms_scores.append(cls_scores) - bbox_preds = bbox_preds.split(num_proposals_per_img, 0) - - if stage < self.num_stages - 1: - bbox_head = self.bbox_head[stage] - refine_rois_list = [] - for i in range(len(batch_img_metas)): - if rois[i].shape[0] > 0: - bbox_label = cls_scores[i][:, :-1].argmax(dim=1) - # Refactor `bbox_head.regress_by_class` to only accept - # box tensor without img_idx concatenated. - refined_bboxes = bbox_head.regress_by_class( - rois[i][:, 1:], bbox_label, bbox_preds[i], - batch_img_metas[i]) - refined_bboxes = get_box_tensor(refined_bboxes) - refined_rois = torch.cat( - [rois[i][:, [0]], refined_bboxes], dim=1) - refine_rois_list.append(refined_rois) - rois = torch.cat(refine_rois_list) - # ms_scores aligned - # average scores of each image by stages - cls_scores = [ - sum([score[i] for score in ms_scores]) / float(len(ms_scores)) - for i in range(len(batch_img_metas)) - ] # aligned - return rois, cls_scores, bbox_preds - - def _bbox_forward(self, stage: int, x: Tuple[Tensor], - rois: Tensor) -> dict: - """Box head forward function used in both training and testing. - - Args: - stage (int): The current stage in Cascade RoI Head. - x (tuple[Tensor]): List of multi-level img features. - rois (Tensor): RoIs with the shape (n, 5) where the first - column indicates batch id of each RoI. - - Returns: - dict[str, Tensor]: Usually returns a dictionary with keys: - - - `cls_score` (Tensor): Classification scores. - - `bbox_pred` (Tensor): Box energies / deltas. - - `bbox_feats` (Tensor): Extract bbox RoI features. 
- """ - bbox_roi_extractor = self.bbox_roi_extractor[stage] - bbox_head = self.bbox_head[stage] - bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs], - rois) - # do not support caffe_c4 model anymore - cls_score, bbox_pred = bbox_head(bbox_feats) - - bbox_results = dict( - cls_score=cls_score, bbox_pred=bbox_pred, bbox_feats=bbox_feats) - return bbox_results - - def predict_bbox(self, - x: Tuple[Tensor], - batch_img_metas: List[dict], - rpn_results_list: InstanceList, - rcnn_test_cfg: ConfigType, - rescale: bool = False, - **kwargs) -> InstanceList: - """Perform forward propagation of the bbox head and predict detection - results on the features of the upstream network. - - Args: - x (tuple[Tensor]): Feature maps of all scale level. - batch_img_metas (list[dict]): List of image information. - rpn_results_list (list[:obj:`InstanceData`]): List of region - proposals. - rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of R-CNN. - rescale (bool): If True, return boxes in original image space. - Defaults to False. - - Returns: - list[:obj:`InstanceData`]: Detection results of each image - after the post process. - Each item usually contains following keys. - - - scores (Tensor): Classification scores, has a shape - (num_instance, ) - - labels (Tensor): Labels of bboxes, has a shape - (num_instances, ). - - bboxes (Tensor): Has a shape (num_instances, 4), - the last dimension 4 arrange as (x1, y1, x2, y2). - """ - proposals = [res.bboxes for res in rpn_results_list] - proposal_scores = [res.scores for res in rpn_results_list] - num_proposals_per_img = tuple(len(p) for p in proposals) - rois = bbox2roi(proposals) - - if rois.shape[0] == 0: - return empty_instances( - batch_img_metas, - rois.device, - task_type='bbox', - box_type=self.bbox_head[-1].predict_box_type, - num_classes=self.bbox_head[-1].num_classes, - score_per_cls=rcnn_test_cfg is None) - # rois aligned - rois, cls_scores, bbox_preds = self._refine_roi( - x=x, - rois=rois, - batch_img_metas=batch_img_metas, - num_proposals_per_img=num_proposals_per_img, - **kwargs) - - # score reweighting in centernet2 - cls_scores = [(s * ps[:, None])**0.5 - for s, ps in zip(cls_scores, proposal_scores)] - cls_scores = [ - s * (s == s[:, :-1].max(dim=1)[0][:, None]).float() - for s in cls_scores - ] - - # fast_rcnn_inference - results_list = self.bbox_head[-1].predict_by_feat( - rois=rois, - cls_scores=cls_scores, - bbox_preds=bbox_preds, - batch_img_metas=batch_img_metas, - rescale=rescale, - rcnn_test_cfg=rcnn_test_cfg) - return results_list - - def _mask_forward(self, x: Tuple[Tensor], rois: Tensor) -> dict: - """Mask head forward function used in both training and testing. - - Args: - stage (int): The current stage in Cascade RoI Head. - x (tuple[Tensor]): Tuple of multi-level img features. - rois (Tensor): RoIs with the shape (n, 5) where the first - column indicates batch id of each RoI. - - Returns: - dict: Usually returns a dictionary with keys: - - - `mask_preds` (Tensor): Mask prediction. - """ - mask_feats = self.mask_roi_extractor( - x[:self.mask_roi_extractor.num_inputs], rois) - # do not support caffe_c4 model anymore - mask_preds = self.mask_head(mask_feats) - - mask_results = dict(mask_preds=mask_preds) - return mask_results - - def mask_loss(self, x, sampling_results: List[SamplingResult], - batch_gt_instances: InstanceList) -> dict: - """Run forward function and calculate loss for mask head in training. - - Args: - x (tuple[Tensor]): Tuple of multi-level img features. 
- sampling_results (list["obj:`SamplingResult`]): Sampling results. - batch_gt_instances (list[:obj:`InstanceData`]): Batch of - gt_instance. It usually includes ``bboxes``, ``labels``, and - ``masks`` attributes. - - Returns: - dict: Usually returns a dictionary with keys: - - - `mask_preds` (Tensor): Mask prediction. - - `loss_mask` (dict): A dictionary of mask loss components. - """ - pos_rois = bbox2roi([res.pos_priors for res in sampling_results]) - mask_results = self._mask_forward(x, pos_rois) - - mask_loss_and_target = self.mask_head.loss_and_target( - mask_preds=mask_results['mask_preds'], - sampling_results=sampling_results, - batch_gt_instances=batch_gt_instances, - rcnn_train_cfg=self.train_cfg[-1]) - mask_results.update(mask_loss_and_target) - - return mask_results - - def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList, - batch_data_samples: SampleList) -> dict: - """Perform forward propagation and loss calculation of the detection - roi on the features of the upstream network. - - Args: - x (tuple[Tensor]): List of multi-level img features. - rpn_results_list (list[:obj:`InstanceData`]): List of region - proposals. - batch_data_samples (list[:obj:`DetDataSample`]): The batch - data samples. It usually includes information such - as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. - - Returns: - dict[str, Tensor]: A dictionary of loss components - """ - raise NotImplementedError - - def predict_mask(self, - x: Tuple[Tensor], - batch_img_metas: List[dict], - results_list: List[InstanceData], - rescale: bool = False) -> List[InstanceData]: - """Perform forward propagation of the mask head and predict detection - results on the features of the upstream network. - - Args: - x (tuple[Tensor]): Feature maps of all scale level. - batch_img_metas (list[dict]): List of image information. - results_list (list[:obj:`InstanceData`]): Detection results of - each image. - rescale (bool): If True, return boxes in original image space. - Defaults to False. - - Returns: - list[:obj:`InstanceData`]: Detection results of each image - after the post process. - Each item usually contains following keys. - - - scores (Tensor): Classification scores, has a shape - (num_instance, ) - - labels (Tensor): Labels of bboxes, has a shape - (num_instances, ). - - bboxes (Tensor): Has a shape (num_instances, 4), - the last dimension 4 arrange as (x1, y1, x2, y2). - - masks (Tensor): Has a shape (num_instances, H, W). 
- """ - bboxes = [res.bboxes for res in results_list] - mask_rois = bbox2roi(bboxes) - if mask_rois.shape[0] == 0: - results_list = empty_instances( - batch_img_metas, - mask_rois.device, - task_type='mask', - instance_results=results_list, - mask_thr_binary=self.test_cfg.mask_thr_binary) - return results_list - - num_mask_rois_per_img = [len(res) for res in results_list] - aug_masks = [] - mask_results = self._mask_forward(x, mask_rois) - mask_preds = mask_results['mask_preds'] - # split batch mask prediction back to each image - mask_preds = mask_preds.split(num_mask_rois_per_img, 0) - aug_masks.append([m.sigmoid().detach() for m in mask_preds]) - - merged_masks = [] - for i in range(len(batch_img_metas)): - aug_mask = [mask[i] for mask in aug_masks] - merged_mask = merge_aug_masks(aug_mask, batch_img_metas[i]) - merged_masks.append(merged_mask) - results_list = self.mask_head.predict_by_feat( - mask_preds=merged_masks, - results_list=results_list, - batch_img_metas=batch_img_metas, - rcnn_test_cfg=self.test_cfg, - rescale=rescale, - activate_map=True) - return results_list diff --git a/projects/Detic/detic/text_encoder.py b/projects/Detic/detic/text_encoder.py deleted file mode 100644 index f0024efaf..000000000 --- a/projects/Detic/detic/text_encoder.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from typing import List, Union - -import torch -import torch.nn as nn - - -class CLIPTextEncoder(nn.Module): - - def __init__(self, model_name='ViT-B/32'): - super().__init__() - import clip - from clip.simple_tokenizer import SimpleTokenizer - self.tokenizer = SimpleTokenizer() - pretrained_model, _ = clip.load(model_name, device='cpu') - self.clip = pretrained_model - - @property - def device(self): - return self.clip.device - - @property - def dtype(self): - return self.clip.dtype - - def tokenize(self, - texts: Union[str, List[str]], - context_length: int = 77) -> torch.LongTensor: - if isinstance(texts, str): - texts = [texts] - - sot_token = self.tokenizer.encoder['<|startoftext|>'] - eot_token = self.tokenizer.encoder['<|endoftext|>'] - all_tokens = [[sot_token] + self.tokenizer.encode(text) + [eot_token] - for text in texts] - result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) - - for i, tokens in enumerate(all_tokens): - if len(tokens) > context_length: - st = torch.randint(len(tokens) - context_length + 1, - (1, ))[0].item() - tokens = tokens[st:st + context_length] - result[i, :len(tokens)] = torch.tensor(tokens) - - return result - - def forward(self, text): - text = self.tokenize(text) - text_features = self.clip.encode_text(text) - return text_features diff --git a/projects/Detic/detic/utils.py b/projects/Detic/detic/utils.py deleted file mode 100644 index 56d4fd429..000000000 --- a/projects/Detic/detic/utils.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
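-# Open-vocabulary helpers: get_text_embeddings() encodes a dataset vocabulary or a -# custom list of class names with CLIP (or returns the path to pre-computed -# embeddings for known datasets), get_class_names() looks up the class names of -# common datasets, and reset_cls_layer_weight() loads the resulting embeddings into -# the zero-shot classification layers of the detector's RoI head.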
-import numpy as np -import torch -import torch.nn.functional as F -from mmengine.logging import print_log - -from .text_encoder import CLIPTextEncoder - -# download from -# https://github.com/facebookresearch/Detic/tree/main/datasets/metadata -DATASET_EMBEDDINGS = { - 'lvis': 'datasets/metadata/lvis_v1_clip_a+cname.npy', - 'objects365': 'datasets/metadata/o365_clip_a+cnamefix.npy', - 'openimages': 'datasets/metadata/oid_clip_a+cname.npy', - 'coco': 'datasets/metadata/coco_clip_a+cname.npy', -} - - -def get_text_embeddings(dataset=None, - custom_vocabulary=None, - prompt_prefix='a '): - assert (dataset is None) ^ (custom_vocabulary is None), \ - 'Either `dataset` or `custom_vocabulary` should be specified.' - if dataset: - if dataset in DATASET_EMBEDDINGS: - return DATASET_EMBEDDINGS[dataset] - else: - custom_vocabulary = get_class_names(dataset) - - text_encoder = CLIPTextEncoder() - text_encoder.eval() - texts = [prompt_prefix + x for x in custom_vocabulary] - print_log( - f'Computing text embeddings for {len(custom_vocabulary)} classes.') - embeddings = text_encoder(texts).detach().permute(1, 0).contiguous().cpu() - return embeddings - - -def get_class_names(dataset): - if dataset == 'coco': - from mmdet.datasets import CocoDataset - class_names = CocoDataset.METAINFO['classes'] - elif dataset == 'cityscapes': - from mmdet.datasets import CityscapesDataset - class_names = CityscapesDataset.METAINFO['classes'] - elif dataset == 'voc': - from mmdet.datasets import VOCDataset - class_names = VOCDataset.METAINFO['classes'] - elif dataset == 'openimages': - from mmdet.datasets import OpenImagesDataset - class_names = OpenImagesDataset.METAINFO['classes'] - elif dataset == 'lvis': - from mmdet.datasets import LVISV1Dataset - class_names = LVISV1Dataset.METAINFO['classes'] - else: - raise TypeError(f'Invalid type for dataset name: {type(dataset)}') - return class_names - - -def reset_cls_layer_weight(model, weight): - if type(weight) == str: - print_log(f'Resetting cls_layer_weight from file: {weight}') - zs_weight = torch.tensor( - np.load(weight), - dtype=torch.float32).permute(1, 0).contiguous() # D x C - else: - zs_weight = weight - zs_weight = torch.cat( - [zs_weight, zs_weight.new_zeros( - (zs_weight.shape[0], 1))], dim=1) # D x (C + 1) - zs_weight = F.normalize(zs_weight, p=2, dim=0) - zs_weight = zs_weight.to('cuda') - num_classes = zs_weight.shape[-1] - - for bbox_head in model.roi_head.bbox_head: - bbox_head.num_classes = num_classes - del bbox_head.fc_cls.zs_weight - bbox_head.fc_cls.zs_weight = zs_weight diff --git a/projects/Detic/detic/zero_shot_classifier.py b/projects/Detic/detic/zero_shot_classifier.py deleted file mode 100644 index 35c9e4928..000000000 --- a/projects/Detic/detic/zero_shot_classifier.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. 
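-# CLIP-based zero-shot classifier: a linear layer projects RoI features to the CLIP -# embedding dimension, and the classification logits are the (optionally L2-normalised -# and temperature-scaled) dot products with fixed per-class text embeddings loaded -# from `zs_weight_path` (or a random matrix when `zs_weight_path` is 'rand').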
-import numpy as np -import torch -from torch import nn -from torch.nn import functional as F - -from mmdet.registry import MODELS - - -@MODELS.register_module(force=True) # avoid bug -class ZeroShotClassifier(nn.Module): - - def __init__( - self, - in_features: int, - out_features: int, # num_classes - zs_weight_path: str, - zs_weight_dim: int = 512, - use_bias: float = 0.0, - norm_weight: bool = True, - norm_temperature: float = 50.0, - ): - super().__init__() - num_classes = out_features - self.norm_weight = norm_weight - self.norm_temperature = norm_temperature - - self.use_bias = use_bias < 0 - if self.use_bias: - self.cls_bias = nn.Parameter(torch.ones(1) * use_bias) - - self.linear = nn.Linear(in_features, zs_weight_dim) - - if zs_weight_path == 'rand': - zs_weight = torch.randn((zs_weight_dim, num_classes)) - nn.init.normal_(zs_weight, std=0.01) - else: - zs_weight = torch.tensor( - np.load(zs_weight_path), - dtype=torch.float32).permute(1, 0).contiguous() # D x C - zs_weight = torch.cat( - [zs_weight, zs_weight.new_zeros( - (zs_weight_dim, 1))], dim=1) # D x (C + 1) - - if self.norm_weight: - zs_weight = F.normalize(zs_weight, p=2, dim=0) - - if zs_weight_path == 'rand': - self.zs_weight = nn.Parameter(zs_weight) - else: - self.register_buffer('zs_weight', zs_weight) - - assert self.zs_weight.shape[1] == num_classes + 1, self.zs_weight.shape - - def forward(self, x, classifier=None): - ''' - Inputs: - x: B x D' - classifier_info: (C', C' x D) - ''' - x = self.linear(x) - if classifier is not None: - zs_weight = classifier.permute(1, 0).contiguous() # D x C' - zs_weight = F.normalize(zs_weight, p=2, dim=0) \ - if self.norm_weight else zs_weight - else: - zs_weight = self.zs_weight - if self.norm_weight: - x = self.norm_temperature * F.normalize(x, p=2, dim=1) - x = torch.mm(x, zs_weight) - if self.use_bias: - x = x + self.cls_bias - return x diff --git a/projects/Detic_new/README.md b/projects/Detic_new/README.md deleted file mode 100644 index 3c7714c36..000000000 --- a/projects/Detic_new/README.md +++ /dev/null @@ -1,248 +0,0 @@ -# Detecting Twenty-thousand Classes using Image-level Supervision - -## Description - -**Detic**: A **Det**ector with **i**mage **c**lasses that can use image-level labels to easily train detectors. - -

- -> [**Detecting Twenty-thousand Classes using Image-level Supervision**](http://arxiv.org/abs/2201.02605), -> Xingyi Zhou, Rohit Girdhar, Armand Joulin, Philipp Krähenbühl, Ishan Misra, -> *ECCV 2022 ([arXiv 2201.02605](http://arxiv.org/abs/2201.02605))* - -## Usage - - - -## Installation - -Detic requires CLIP to be installed: - -```shell -pip install git+https://github.com/openai/CLIP.git -``` - -## Prepare Datasets - -It is recommended to download and extract the datasets somewhere outside the project directory and symlink the dataset root to `$MMDETECTION/data` as below. If your folder structure is different, you may need to change the corresponding paths in the config files. - -### LVIS - -The LVIS dataset is adopted as the box-labeled data; [LVIS](https://www.lvisdataset.org/) is available from the official website or a mirror. You need to generate `lvis_v1_train_norare.json` according to the [official dataset preparation guide](https://github.com/facebookresearch/Detic/blob/main/datasets/README.md#coco-and-lvis) for open-vocabulary LVIS, which removes the labels of the 337 rare classes from training. You can also download [lvis_v1_train_norare.json](https://download.openmmlab.com/mmdetection/v3.0/detic/data/lvis/annotations/lvis_v1_train_norare.json) from our backup. The directory should look like this. - -```shell -mmdetection -├── data -│ ├── lvis -│ │ ├── annotations -│ │ | ├── lvis_v1_train.json -│ │ | ├── lvis_v1_val.json -│ │ | ├── lvis_v1_train_norare.json -│ │ ├── train2017 -│ │ ├── val2017 -``` - -### ImageNet-LVIS - -ImageNet-LVIS is adopted as the image-labeled data. You can download the [ImageNet-21K](https://www.image-net.org/download.php) dataset from the official website. Then you need to unzip the classes that overlap with LVIS and convert them into the LVIS annotation format according to the [official dataset preparation guide](https://github.com/facebookresearch/Detic/blob/main/datasets/README.md#imagenet-21k). The directory should look like this. - -```shell -mmdetection -├── data -│ ├── imagenet -│ │ ├── annotations -│ │ | ├── imagenet_lvis_image_info.json -│ │ ├── ImageNet-21K -│ │ | ├── n00007846 -│ │ | ├── n01318894 -│ │ | ├── ... -``` - -### Metadata - -`data/metadata/` contains the preprocessed metadata (included in the repo). Please follow the [official instructions](https://github.com/facebookresearch/Detic/blob/main/datasets/README.md#metadata) to pre-process the LVIS dataset. This generates `lvis_v1_train_cat_info.json` for the Federated loss, which contains the frequency of each category in the LVIS training set. In addition, `lvis_v1_clip_a+cname.npy` holds the pre-computed CLIP embeddings for each LVIS category. You can also choose to directly download [lvis_v1_train_cat_info](https://download.openmmlab.com/mmdetection/v3.0/detic/data/metadata/lvis_v1_train_cat_info.json) and [lvis_v1_clip_a+cname.npy](https://download.openmmlab.com/mmdetection/v3.0/detic/data/metadata/lvis_v1_clip_a%2Bcname.npy) from our backup. The directory should look like this. - -```shell -mmdetection -├── data -│ ├── metadata -│ │ ├── lvis_v1_train_cat_info.json -│ │ ├── lvis_v1_clip_a+cname.npy -``` - -## Demo - -Here we provide the Detic model for the open-vocabulary demo. This model is trained on combined LVIS-COCO and ImageNet-21K for better demo purposes. LVIS models do not detect persons well due to the federated annotation protocol of LVIS; LVIS+COCO models give better visual results.
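- -To try the demo locally, first fetch the checkpoint listed in the table below. A minimal example (the URL is taken from the table; `checkpoints/` is only a suggested destination directory): - -```shell -mkdir -p checkpoints -wget -P checkpoints https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k_20230120-0d301978.pth -```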
- -| Backbone | Training data | Config | Download | -| :------: | :----------------------------: | :-------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| Swin-B | LVIS & COCO & ImageNet-21K | [config](./configs/detic_centernet2_swin-b_fpn_4x_lvis_coco_in21k.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k_20230120-0d301978.pth) | - -You can also download other models from the [official model zoo](https://github.com/facebookresearch/Detic/blob/main/docs/MODEL_ZOO.md) and convert their format by running - -```shell -python tools/model_converters/detic_to_mmdet.py --src /path/to/detic_weight.pth --dst /path/to/mmdet_weight.pth -``` - -### Inference with existing dataset vocabulary - -You can detect the classes of an existing dataset with the `--texts` argument: - -```shell -python demo/image_demo.py \ - ${IMAGE_PATH} \ - ${CONFIG_PATH} \ - ${MODEL_PATH} \ - --texts lvis \ - --pred-score-thr 0.5 \ - --palette 'random' -``` - -![image](https://user-images.githubusercontent.com/12907710/213624759-f0a2ba0c-0f5c-4424-a350-5ba5349e5842.png) - -### Inference with custom vocabularies - -Detic can detect any class given class names by using CLIP. You can detect customized classes with the `--texts` argument: - -```shell -python demo/image_demo.py \ - ${IMAGE_PATH} \ - ${CONFIG_PATH} \ - ${MODEL_PATH} \ - --texts 'headphone . webcam . paper . coffe.' \ - --pred-score-thr 0.3 \ - --palette 'random' -``` - -![image](https://user-images.githubusercontent.com/12907710/213624637-e9e8a313-9821-4782-a18a-4408c876852b.png) - -Note that `headphone`, `paper` and `coffe` (typo intended) are not LVIS classes. Despite the misspelled class name, Detic can produce a reasonable detection for `coffe`. - -## Models and Results - -### Training - -There are two stages in the whole training process. The first stage trains a baseline model using images with box labels. The second stage fine-tunes the baseline model and leverages image-labeled data. - -#### First stage - -To train the box-supervised baseline, run - -```shell -bash ./tools/dist_train.sh projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis_boxsup.py 8 -``` - -| Model (Config) | mask mAP | mask mAP(official) | mask mAP_rare | mask mAP_rare(official) | -| :---------------------------------------------------------------------------------------------: | :------: | :----------------: | :-----------: | :---------------------: | -| [detic_centernet2_r50_fpn_4x_lvis_boxsup](./configs/detic_centernet2_r50_fpn_4x_lvis_boxsup.py) | 31.6 | 31.5 | 26.6 | 25.6 | - -#### Second stage - -The second stage uses both object detection and image classification datasets. - -##### Multi-Datasets Config - -We provide an improved dataset wrapper, `ConcatDataset`, to concatenate multiple datasets; the datasets can have different annotation types and different pipelines (e.g., image sizes). You can also obtain the dataset source index of each sample through `get_dataset_source`. We provide the sampler `MultiDataSampler` to customize the sampling ratios of the different datasets. Besides, we provide the batch sampler `MultiDataAspectRatioBatchSampler` to enable different datasets to have different batch sizes.
The config of multiple datasets is as follows: - -```python -dataset_det = dict( - type='ClassBalancedDataset', - oversample_thr=1e-3, - dataset=dict( - type='LVISV1Dataset', - data_root='data/lvis/', - ann_file='annotations/lvis_v1_train.json', - data_prefix=dict(img=''), - filter_cfg=dict(filter_empty_gt=True, min_size=32), - pipeline=train_pipeline_det, - backend_args=backend_args)) - -dataset_cls = dict( - type='ImageNetLVISV1Dataset', - data_root='data/imagenet', - ann_file='annotations/imagenet_lvis_image_info.json', - data_prefix=dict(img='ImageNet-LVIS/'), - pipeline=train_pipeline_cls, - backend_args=backend_args) - -train_dataloader = dict( - batch_size=[8, 32], - num_workers=2, - persistent_workers=True, - sampler=dict( - type='MultiDataSampler', - dataset_ratio=[1, 4]), - batch_sampler=dict( - type='MultiDataAspectRatioBatchSampler', - num_datasets=2), - dataset=dict( - type='ConcatDataset', - datasets=[dataset_det, dataset_cls])) -``` - -###### Note: - -- If one of the multiple datasets is itself a `ConcatDataset`, it is still counted as a single dataset for `num_datasets` in `MultiDataAspectRatioBatchSampler`. - -To fine-tune the baseline model with image-labeled data, run: - -```shell -bash ./tools/dist_train.sh projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis_in21k-lvis.py 8 -``` - -| Model (Config) | mask mAP | mask mAP(official) | mask mAP_rare | mask mAP_rare(official) | -| :-----------------------------------------------------------------------------------------------------: | :------: | :----------------: | :-----------: | :---------------------: | -| [detic_centernet2_r50_fpn_4x_lvis_in21k-lvis](./configs/detic_centernet2_r50_fpn_4x_lvis_in21k-lvis.py) | 32.9 | 33.2 | 30.9 | 29.7 | - -#### Standard LVIS Results - -| Model (Config) | mask mAP | mask mAP(official) | mask mAP_rare | mask mAP_rare(official) | Download | -| :-----------------------------------------------------------------------------------------------------------: | :------: | :----------------: | :-----------: | :---------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| [detic_centernet2_r50_fpn_4x_lvis_boxsup](./configs/detic_centernet2_r50_fpn_4x_lvis_boxsup.py) | 31.6 | 31.5 | 26.6 | 25.6 | [model](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_r50_fpn_4x_lvis_boxsup/detic_centernet2_r50_fpn_4x_lvis_boxsup_20230911_233514-54116677.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_r50_fpn_4x_lvis_boxsup/detic_centernet2_r50_fpn_4x_lvis_boxsup_20230911_233514.log.json) | -| [detic_centernet2_r50_fpn_4x_lvis_in21k-lvis](./configs/detic_centernet2_r50_fpn_4x_lvis_in21k-lvis.py) | 32.9 | 33.2 | 30.9 | 29.7 | [model](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_r50_fpn_4x_lvis_in21k-lvis/detic_centernet2_r50_fpn_4x_lvis_in21k-lvis_20230912_040619-9e7a3258.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_r50_fpn_4x_lvis_in21k-lvis/detic_centernet2_r50_fpn_4x_lvis_in21k-lvis_20230912_040619.log.json) | -| [detic_centernet2_swin-b_fpn_4x_lvis_boxsup](./configs/detic_centernet2_swin-b_fpn_4x_lvis_boxsup.py) | 40.7 | 40.7 | 38.0 | 35.9 |
[model](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_swin-b_fpn_4x_lvis_boxsup/detic_centernet2_swin-b_fpn_4x_lvis_boxsup_20230825_061737-328e85f9.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_swin-b_fpn_4x_lvis_boxsup/detic_centernet2_swin-b_fpn_4x_lvis_boxsup_20230825_061737.log.json) | -| [detic_centernet2_swin-b_fpn_4x_lvis_in21k-lvis](./configs/detic_centernet2_swin-b_fpn_4x_lvis_in21k-lvis.py) | 41.7 | 41.7 | 41.7 | 41.7 | [model](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_swin-b_fpn_4x_lvis_in21k-lvis/detic_centernet2_swin-b_fpn_4x_lvis_in21k-lvis_20230926_235410-0c152391.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_swin-b_fpn_4x_lvis_in21k-lvis/detic_centernet2_swin-b_fpn_4x_lvis_in21k-lvis_20230926_235410.log.json) | - -#### Open-vocabulary LVIS Results - -| Model (Config) | mask mAP | mask mAP(official) | mask mAP_rare | mask mAP_rare(officical) | Download | -| :---------------------------------------------------------------------------------------------------------------: | :------: | :----------------: | :-----------: | :----------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| [detic_centernet2_r50_fpn_4x_lvis-base_boxsup](./configs/detic_centernet2_r50_fpn_4x_lvis-base_boxsup.py) | 30.4 | 30.2 | 16.2 | 16.4 | [model](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_r50_fpn_4x_lvis-base_boxsup/detic_centernet2_r50_fpn_4x_lvis-base_boxsup_20230921_180638-c1685ee2.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_r50_fpn_4x_lvis-base_boxsup/detic_centernet2_r50_fpn_4x_lvis-base_boxsup_20230921_180638.log.json) | -| [detic_centernet2_r50_fpn_4x_lvis-base_in21k-lvis](./configs/detic_centernet2_r50_fpn_4x_lvis-base_in21k-lvis.py) | 32.6 | 32.4 | 27.4 | 24.9 | [model](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_r50_fpn_4x_lvis-base_in21k-lvis/detic_centernet2_r50_fpn_4x_lvis-base_in21k-lvis_20230925_014315-2d2cc8b7.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_r50_fpn_4x_lvis-base_in21k-lvis/detic_centernet2_r50_fpn_4x_lvis-base_in21k-lvis_20230925_014315.log.json) | - -### Testing - -#### Test Command - -To evaluate a model with a trained model, run - -```shell -python ./tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} -``` - -#### Open-vocabulary LVIS Results - -The models are converted from the official model zoo. 
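- -For example, assuming the first checkpoint in the table below has been downloaded into the current directory, the evaluation command could look like this: - -```shell -python ./tools/test.py \ - projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis-base_boxsup.py \ - detic_centernet2_swin-b_fpn_4x_lvis-base_boxsup-481281c8.pth -```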
- -| Model (Config) | mask mAP | mask mAP_novel | Download | -| :---------------------------------------------------------------------------------------------------------------------: | :------: | :------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| [detic_centernet2_swin-b_fpn_4x_lvis-base_boxsup](./configs/detic_centernet2_swin-b_fpn_4x_lvis-base_boxsup.py) | 38.4 | 21.9 | [model](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_swin-b_fpn_4x_lvis-base_boxsup/detic_centernet2_swin-b_fpn_4x_lvis-base_boxsup-481281c8.pth) | -| [detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis](./configs/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis.py) | 40.7 | 34.0 | [model](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis-ec91245d.pth) | - -###### Note: - -- The open-vocabulary LVIS setup is LVIS without rare class annotations in training, termed `lvisbase`. We evaluate rare classes as novel classes in testing. -- ` in21k-lvis` denotes that the model use the overlap classes between ImageNet-21K and LVIS as image-labeled data. - -## Citation - -If you find Detic is useful in your research or applications, please consider giving a star 🌟 to the [official repository](https://github.com/facebookresearch/Detic) and citing Detic by the following BibTeX entry. - -```BibTeX -@inproceedings{zhou2022detecting, - title={Detecting Twenty-thousand Classes using Image-level Supervision}, - author={Zhou, Xingyi and Girdhar, Rohit and Joulin, Armand and Kr{\"a}henb{\"u}hl, Philipp and Misra, Ishan}, - booktitle={ECCV}, - year={2022} -} -``` diff --git a/projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis-base_boxsup.py b/projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis-base_boxsup.py deleted file mode 100644 index 8ca57b77d..000000000 --- a/projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis-base_boxsup.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './detic_centernet2_r50_fpn_4x_lvis_boxsup.py' - -# 'lvis_v1_train_norare.json' is the annotations of lvis_v1 -# removing the labels of 337 rare-class -train_dataloader = dict( - dataset=dict( - type='ClassBalancedDataset', - oversample_thr=1e-3, - dataset=dict(ann_file='annotations/lvis_v1_train_norare.json'))) diff --git a/projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis-base_in21k-lvis.py b/projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis-base_in21k-lvis.py deleted file mode 100644 index 034acb6eb..000000000 --- a/projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis-base_in21k-lvis.py +++ /dev/null @@ -1,93 +0,0 @@ -_base_ = './detic_centernet2_r50_fpn_4x_lvis_boxsup.py' -dataset_type = ['LVISV1Dataset', 'ImageNetLVISV1Dataset'] -image_size_det = (640, 640) -image_size_cls = (320, 320) - -# backend = 'pillow' -backend_args = None - -train_pipeline_det = [ - dict(type='LoadImageFromFile', backend_args=backend_args), - dict(type='LoadAnnotations', with_bbox=True, with_mask=True), - dict( - type='RandomResize', - scale=image_size_det, - ratio_range=(0.1, 2.0), - keep_ratio=True), - dict( - type='RandomCrop', - crop_type='absolute_range', - crop_size=image_size_det, - recompute_bbox=True, - allow_negative_crop=True), - dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), - dict(type='RandomFlip', prob=0.5), - 
dict(type='PackDetInputs') -] - -train_pipeline_cls = [ - dict(type='LoadImageFromFile', backend_args=backend_args), - dict(type='LoadAnnotations', with_bbox=False, with_label=True), - dict( - type='RandomResize', - scale=image_size_cls, - ratio_range=(0.5, 1.5), - keep_ratio=True), - dict( - type='RandomCrop', - crop_type='absolute_range', - crop_size=image_size_cls, - recompute_bbox=False, - bbox_clip_border=False, - allow_negative_crop=True), - dict(type='RandomFlip', prob=0.5), - dict(type='PackDetInputs') -] - -# 'lvis_v1_train_norare.json' is the annotations of lvis_v1 -# removing the labels of 337 rare-class -dataset_det = dict( - type='ClassBalancedDataset', - oversample_thr=1e-3, - dataset=dict( - type='LVISV1Dataset', - data_root='data/lvis/', - ann_file='annotations/lvis_v1_train_norare.json', - data_prefix=dict(img=''), - filter_cfg=dict(filter_empty_gt=True, min_size=32), - pipeline=train_pipeline_det, - backend_args=backend_args)) - -dataset_cls = dict( - type='ImageNetLVISV1Dataset', - data_root='data/imagenet', - ann_file='annotations/imagenet_lvis_image_info.json', - data_prefix=dict(img='ImageNet-LVIS/'), - pipeline=train_pipeline_cls, - backend_args=backend_args) - -train_dataloader = dict( - _delete_=True, - batch_size=[8, 32], - num_workers=2, - persistent_workers=True, - sampler=dict(type='MultiDataSampler', dataset_ratio=[1, 4]), - batch_sampler=dict( - type='MultiDataAspectRatioBatchSampler', num_datasets=2), - dataset=dict(type='ConcatDataset', datasets=[dataset_det, dataset_cls])) - -param_scheduler = [ - dict( - type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, - end=1000), - dict( - type='CosineAnnealingLR', - begin=0, - by_epoch=False, - T_max=90000, - ) -] - -load_from = './first_stage/detic_centernet2_r50_fpn_4x_lvis-base_boxsup.pth' - -find_unused_parameters = True diff --git a/projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis_boxsup.py b/projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis_boxsup.py deleted file mode 100644 index a11be374c..000000000 --- a/projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis_boxsup.py +++ /dev/null @@ -1,410 +0,0 @@ -_base_ = 'mmdet::_base_/default_runtime.py' -dataset_type = 'LVISV1Dataset' -custom_imports = dict( - imports=['projects.Detic_new.detic'], allow_failed_imports=False) - -num_classes = 1203 -lvis_cat_frequency_info = 'data/metadata/lvis_v1_train_cat_info.json' - -# 'data/metadata/lvis_v1_clip_a+cname.npy' is pre-computed -# CLIP embeddings for each category -cls_layer = dict( - type='ZeroShotClassifier', - zs_weight_path='data/metadata/lvis_v1_clip_a+cname.npy', - zs_weight_dim=512, - use_bias=0.0, - norm_weight=True, - norm_temperature=50.0) -reg_layer = [ - dict(type='Linear', in_features=1024, out_features=1024), - dict(type='ReLU', inplace=True), - dict(type='Linear', in_features=1024, out_features=4) -] - -model = dict( - type='Detic', - data_preprocessor=dict( - type='DetDataPreprocessor', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - bgr_to_rgb=True, - pad_size_divisor=32), - backbone=dict( - type='ResNet', - depth=50, - num_stages=4, - out_indices=(1, 2, 3), - norm_cfg=dict(type='BN', requires_grad=False), - norm_eval=True, - init_cfg=dict( - type='Pretrained', - checkpoint='https://miil-public-eu.oss-eu-central-1.aliyuncs.com/' - 'model-zoo/ImageNet_21K_P/models/resnet50_miil_21k.pth')), - neck=dict( - type='FPN', - in_channels=[512, 1024, 2048], - out_channels=256, - start_level=0, - add_extra_convs='on_output', - num_outs=5, - 
init_cfg=dict(type='Caffe2Xavier', layer='Conv2d'), - relu_before_extra_convs=True), - rpn_head=dict( - type='CenterNetRPNHead', - num_classes=1, - in_channels=256, - stacked_convs=4, - feat_channels=256, - strides=[8, 16, 32, 64, 128], - conv_bias=True, - norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), - loss_cls=dict( - type='HeatmapFocalLoss', - alpha=0.25, - beta=4.0, - gamma=2.0, - pos_weight=0.5, - neg_weight=0.5, - loss_weight=1.0, - ignore_high_fp=0.85, - ), - loss_bbox=dict(type='GIoULoss', eps=1e-6, loss_weight=1.0), - ), - roi_head=dict( - type='DeticRoIHead', - num_stages=3, - stage_loss_weights=[1.0, 1.0, 1.0], - bbox_roi_extractor=dict( - type='SingleRoIExtractor', - roi_layer=dict( - type='RoIAlign', - output_size=7, - sampling_ratio=0, - use_torchvision=True), - out_channels=256, - featmap_strides=[8, 16, 32], - # approximately equal to - # canonical_box_size=224, canonical_level=4 in D2 - finest_scale=112), - bbox_head=[ - dict( - type='DeticBBoxHead', - in_channels=256, - fc_out_channels=1024, - roi_feat_size=7, - num_classes=num_classes, - cls_predictor_cfg=cls_layer, - reg_predictor_cfg=reg_layer, - use_fed_loss=True, - cat_freq_path=lvis_cat_frequency_info, - bbox_coder=dict( - type='DeltaXYWHBBoxCoder', - target_means=[0., 0., 0., 0.], - target_stds=[0.1, 0.1, 0.2, 0.2]), - reg_class_agnostic=True, - loss_cls=dict( - type='CrossEntropyLoss', use_sigmoid=True, - loss_weight=1.0), - loss_bbox=dict(type='SmoothL1Loss', beta=0.1, - loss_weight=1.0)), - dict( - type='DeticBBoxHead', - in_channels=256, - fc_out_channels=1024, - roi_feat_size=7, - num_classes=num_classes, - cls_predictor_cfg=cls_layer, - reg_predictor_cfg=reg_layer, - use_fed_loss=True, - cat_freq_path=lvis_cat_frequency_info, - bbox_coder=dict( - type='DeltaXYWHBBoxCoder', - target_means=[0., 0., 0., 0.], - target_stds=[0.05, 0.05, 0.1, 0.1]), - reg_class_agnostic=True, - loss_cls=dict( - type='CrossEntropyLoss', use_sigmoid=True, - loss_weight=1.0), - loss_bbox=dict(type='SmoothL1Loss', beta=0.1, - loss_weight=1.0)), - dict( - type='DeticBBoxHead', - in_channels=256, - fc_out_channels=1024, - roi_feat_size=7, - num_classes=num_classes, - cls_predictor_cfg=cls_layer, - reg_predictor_cfg=reg_layer, - use_fed_loss=True, - cat_freq_path=lvis_cat_frequency_info, - bbox_coder=dict( - type='DeltaXYWHBBoxCoder', - target_means=[0., 0., 0., 0.], - target_stds=[0.033, 0.033, 0.067, 0.067]), - reg_class_agnostic=True, - loss_cls=dict( - type='CrossEntropyLoss', use_sigmoid=True, - loss_weight=1.0), - loss_bbox=dict(type='SmoothL1Loss', beta=0.1, loss_weight=1.0)) - ], - mask_roi_extractor=dict( - type='SingleRoIExtractor', - roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), - out_channels=256, - featmap_strides=[8, 16, 32], - # approximately equal to - # canonical_box_size=224, canonical_level=4 in D2 - finest_scale=112), - mask_head=dict( - type='FCNMaskHead', - num_convs=4, - in_channels=256, - conv_out_channels=256, - class_agnostic=True, - num_classes=num_classes, - loss_mask=dict( - type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), - # model training and testing settings - train_cfg=dict( - rpn=dict( - assigner=dict( - type='MaxIoUAssigner', - pos_iou_thr=0.7, - neg_iou_thr=0.3, - min_pos_iou=0.3, - match_low_quality=True, - ignore_iof_thr=-1), - sampler=dict( - type='RandomSampler', - num=256, - pos_fraction=0.5, - neg_pos_ub=-1, - add_gt_as_proposals=False), - allowed_border=0, - pos_weight=-1, - debug=False), - rpn_proposal=dict( - score_thr=0.0001, - nms_pre=4000, - 
max_per_img=2000, - nms=dict(type='nms', iou_threshold=0.9), - min_bbox_size=0), - rcnn=[ - dict( - assigner=dict( - type='MaxIoUAssigner', - pos_iou_thr=0.6, - neg_iou_thr=0.6, - min_pos_iou=0.6, - match_low_quality=False, - ignore_iof_thr=-1), - sampler=dict( - type='RandomSampler', - num=512, - pos_fraction=0.25, - neg_pos_ub=-1, - add_gt_as_proposals=True), - mask_size=28, - pos_weight=-1, - debug=False), - dict( - assigner=dict( - type='MaxIoUAssigner', - pos_iou_thr=0.7, - neg_iou_thr=0.7, - min_pos_iou=0.7, - match_low_quality=False, - ignore_iof_thr=-1), - sampler=dict( - type='RandomSampler', - num=512, - pos_fraction=0.25, - neg_pos_ub=-1, - add_gt_as_proposals=False), - mask_size=28, - pos_weight=-1, - debug=False), - dict( - assigner=dict( - type='MaxIoUAssigner', - pos_iou_thr=0.8, - neg_iou_thr=0.8, - min_pos_iou=0.8, - match_low_quality=False, - ignore_iof_thr=-1), - sampler=dict( - type='RandomSampler', - num=512, - pos_fraction=0.25, - neg_pos_ub=-1, - add_gt_as_proposals=False), - mask_size=28, - pos_weight=-1, - debug=False) - ]), - test_cfg=dict( - rpn=dict( - score_thr=0.0001, - nms_pre=1000, - max_per_img=256, - nms=dict(type='nms', iou_threshold=0.9), - min_bbox_size=0), - rcnn=dict( - score_thr=0.02, - nms=dict(type='nms', iou_threshold=0.5), - max_per_img=300, - mask_thr_binary=0.5))) - -# backend = 'pillow' -backend_args = None - -train_pipeline = [ - dict(type='LoadImageFromFile', backend_args=backend_args), - dict(type='LoadAnnotations', with_bbox=True, with_mask=True), - dict( - type='RandomResize', - scale=(640, 640), - ratio_range=(0.1, 2.0), - keep_ratio=True), - dict( - type='RandomCrop', - crop_type='absolute_range', - crop_size=(640, 640), - recompute_bbox=True, - allow_negative_crop=True), - dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), - dict(type='RandomFlip', prob=0.5), - dict(type='PackDetInputs') -] - -test_pipeline = [ - dict( - type='LoadImageFromFile', - backend_args=backend_args, - imdecode_backend=backend_args), - dict( - type='Resize', - scale=(1333, 800), - keep_ratio=True, - backend=backend_args), - dict( - type='LoadAnnotations', - with_bbox=True, - with_mask=True, - poly2mask=False), - dict( - type='PackDetInputs', - meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', - 'scale_factor', 'text', 'custom_entities')) -] - -val_pipeline = [ - dict( - type='LoadImageFromFile', - backend_args=backend_args, - imdecode_backend=backend_args), - dict( - type='Resize', - scale=(1333, 800), - keep_ratio=True, - backend=backend_args), - dict( - type='LoadAnnotations', - with_bbox=True, - with_mask=True, - poly2mask=False), - dict( - type='PackDetInputs', - meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', - 'scale_factor')) -] - -train_dataloader = dict( - batch_size=8, - num_workers=2, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), - batch_sampler=dict(type='AspectRatioBatchSampler'), - dataset=dict( - type='ClassBalancedDataset', - oversample_thr=1e-3, - dataset=dict( - type='LVISV1Dataset', - data_root='data/lvis/', - ann_file='annotations/lvis_v1_train_norare.json', - data_prefix=dict(img=''), - filter_cfg=dict(filter_empty_gt=True, min_size=32), - pipeline=train_pipeline, - backend_args=backend_args))) - -val_dataloader = dict( - batch_size=8, - num_workers=2, - persistent_workers=True, - drop_last=False, - pin_memory=True, - sampler=dict(type='DefaultSampler', shuffle=False), - dataset=dict( - type='LVISV1Dataset', - data_root='data/lvis/', - ann_file='annotations/lvis_v1_val.json', - 
data_prefix=dict(img=''), - pipeline=val_pipeline, - return_classes=False)) - -test_dataloader = dict( - batch_size=8, - num_workers=2, - persistent_workers=True, - drop_last=False, - pin_memory=True, - sampler=dict(type='DefaultSampler', shuffle=False), - dataset=dict( - type='LVISV1Dataset', - data_root='data/lvis/', - ann_file='annotations/lvis_v1_val.json', - data_prefix=dict(img=''), - pipeline=test_pipeline, - return_classes=True)) - -val_evaluator = dict( - type='LVISMetric', - ann_file='data/lvis/annotations/lvis_v1_val.json', - metric=['bbox', 'segm']) -test_evaluator = val_evaluator - -# training schedule for 90k with batch_size of 64 -# with total batch_size of 16, 90k iters is equivalent to '1x' (12 epochs) -# with total batch_size of 64, 90k iters is equivalent to '4x' -max_iter = 90000 -train_cfg = dict( - type='IterBasedTrainLoop', max_iters=max_iter, val_interval=90000) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') - -# Enable automatic-mixed-precision training with AmpOptimWrapper. -optim_wrapper = dict( - type='OptimWrapper', - optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001), - paramwise_cfg=dict(norm_decay_mult=0.), - clip_grad=dict(max_norm=1.0, norm_type=2)) - -param_scheduler = [ - dict( - type='LinearLR', - start_factor=0.0001, - by_epoch=False, - begin=0, - end=10000), - dict( - type='CosineAnnealingLR', - begin=0, - by_epoch=False, - T_max=max_iter, - ) -] - -# only keep latest 5 checkpoints -default_hooks = dict( - checkpoint=dict(by_epoch=False, interval=30000, max_keep_ckpts=5), - logger=dict(type='LoggerHook', interval=50)) diff --git a/projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis_in21k-lvis.py b/projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis_in21k-lvis.py deleted file mode 100644 index ce97ed6d5..000000000 --- a/projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis_in21k-lvis.py +++ /dev/null @@ -1,91 +0,0 @@ -_base_ = './detic_centernet2_r50_fpn_4x_lvis_boxsup.py' -dataset_type = ['LVISV1Dataset', 'ImageNetLVISV1Dataset'] -image_size_det = (640, 640) -image_size_cls = (320, 320) - -# backend = 'pillow' -backend_args = None - -train_pipeline_det = [ - dict(type='LoadImageFromFile', backend_args=backend_args), - dict(type='LoadAnnotations', with_bbox=True, with_mask=True), - dict( - type='RandomResize', - scale=image_size_det, - ratio_range=(0.1, 2.0), - keep_ratio=True), - dict( - type='RandomCrop', - crop_type='absolute_range', - crop_size=image_size_det, - recompute_bbox=True, - allow_negative_crop=True), - dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), - dict(type='RandomFlip', prob=0.5), - dict(type='PackDetInputs') -] - -train_pipeline_cls = [ - dict(type='LoadImageFromFile', backend_args=backend_args), - dict(type='LoadAnnotations', with_bbox=False, with_label=True), - dict( - type='RandomResize', - scale=image_size_cls, - ratio_range=(0.5, 1.5), - keep_ratio=True), - dict( - type='RandomCrop', - crop_type='absolute_range', - crop_size=image_size_cls, - recompute_bbox=False, - bbox_clip_border=False, - allow_negative_crop=True), - dict(type='RandomFlip', prob=0.5), - dict(type='PackDetInputs') -] - -dataset_det = dict( - type='ClassBalancedDataset', - oversample_thr=1e-3, - dataset=dict( - type='LVISV1Dataset', - data_root='data/lvis/', - ann_file='annotations/lvis_v1_train.json', - data_prefix=dict(img=''), - filter_cfg=dict(filter_empty_gt=True, min_size=32), - pipeline=train_pipeline_det, - backend_args=backend_args)) - -dataset_cls = dict( - type='ImageNetLVISV1Dataset', 
- data_root='data/imagenet', - ann_file='annotations/imagenet_lvis_image_info.json', - data_prefix=dict(img='ImageNet-LVIS/'), - pipeline=train_pipeline_cls, - backend_args=backend_args) - -train_dataloader = dict( - _delete_=True, - batch_size=[8, 32], - num_workers=2, - persistent_workers=True, - sampler=dict(type='MultiDataSampler', dataset_ratio=[1, 4]), - batch_sampler=dict( - type='MultiDataAspectRatioBatchSampler', num_datasets=2), - dataset=dict(type='ConcatDataset', datasets=[dataset_det, dataset_cls])) - -param_scheduler = [ - dict( - type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, - end=1000), - dict( - type='CosineAnnealingLR', - begin=0, - by_epoch=False, - T_max=90000, - ) -] - -load_from = './first_stage/detic_centernet2_r50_fpn_4x_lvis_boxsup.pth' - -find_unused_parameters = True diff --git a/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis-base_boxsup.py b/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis-base_boxsup.py deleted file mode 100644 index efedd111e..000000000 --- a/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis-base_boxsup.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './detic_centernet2_swin-b_fpn_4x_lvis_boxsup.py' - -# 'lvis_v1_train_norare.json' is the annotations of lvis_v1 -# removing the labels of 337 rare-class -train_dataloader = dict( - dataset=dict( - type='ClassBalancedDataset', - oversample_thr=1e-3, - dataset=dict(ann_file='annotations/lvis_v1_train_norare.json'))) diff --git a/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis.py b/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis.py deleted file mode 100644 index 1df70970e..000000000 --- a/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis.py +++ /dev/null @@ -1,118 +0,0 @@ -_base_ = './detic_centernet2_r50_fpn_4x_lvis_in21k-lvis.py' - -image_size_det = (896, 896) -image_size_cls = (448, 448) - -model = dict( - backbone=dict( - _delete_=True, - type='SwinTransformer', - embed_dims=128, - depths=[2, 2, 18, 2], - num_heads=[4, 8, 16, 32], - window_size=7, - mlp_ratio=4, - qkv_bias=True, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0.3, - patch_norm=True, - out_indices=(1, 2, 3), - with_cp=False), - neck=dict(in_channels=[256, 512, 1024])) - -backend_args = None -train_pipeline_det = [ - dict(type='LoadImageFromFile', backend_args=backend_args), - dict(type='LoadAnnotations', with_bbox=True, with_mask=True), - dict( - type='RandomResize', - scale=image_size_det, - ratio_range=(0.1, 2.0), - keep_ratio=True), - dict( - type='RandomCrop', - crop_type='absolute_range', - crop_size=image_size_det, - recompute_bbox=True, - allow_negative_crop=True), - dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), - dict(type='RandomFlip', prob=0.5), - dict(type='PackDetInputs') -] - -train_pipeline_cls = [ - dict(type='LoadImageFromFile', backend_args=backend_args), - dict(type='LoadAnnotations', with_bbox=False, with_label=True), - dict( - type='RandomResize', - scale=image_size_cls, - ratio_range=(0.5, 1.5), - keep_ratio=True), - dict( - type='RandomCrop', - crop_type='absolute_range', - crop_size=image_size_cls, - recompute_bbox=False, - bbox_clip_border=False, - allow_negative_crop=True), - dict(type='RandomFlip', prob=0.5), - dict(type='PackDetInputs') -] - -# 'lvis_v1_train_norare.json' is the annotations of lvis_v1 -# removing the labels of 337 rare-class -dataset_det = dict( - type='ClassBalancedDataset', - oversample_thr=1e-3, - 
dataset=dict( - type='LVISV1Dataset', - data_root='data/lvis/', - ann_file='annotations/lvis_v1_train_norare.json', - data_prefix=dict(img=''), - filter_cfg=dict(filter_empty_gt=True, min_size=32), - pipeline=train_pipeline_det, - backend_args=backend_args)) - -dataset_cls = dict( - type='ImageNetLVISV1Dataset', - data_root='data/imagenet', - ann_file='annotations/imagenet_lvis_image_info.json', - data_prefix=dict(img='ImageNet-LVIS/'), - pipeline=train_pipeline_cls, - backend_args=backend_args) - -train_dataloader = dict( - _delete_=True, - batch_size=[4, 16], - num_workers=2, - persistent_workers=True, - sampler=dict(type='MultiDataSampler', dataset_ratio=[1, 4]), - batch_sampler=dict( - type='MultiDataAspectRatioBatchSampler', num_datasets=2), - dataset=dict(type='ConcatDataset', datasets=[dataset_det, dataset_cls])) - -# training schedule for 180k -max_iter = 180000 -train_cfg = dict( - type='IterBasedTrainLoop', max_iters=max_iter, val_interval=180000) - -optim_wrapper = dict( - type='OptimWrapper', - optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0001)) - -param_scheduler = [ - dict( - type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, - end=1000), - dict( - type='CosineAnnealingLR', - begin=0, - by_epoch=False, - T_max=max_iter, - ) -] - -load_from = './first_stage/detic_centernet2_swin-b_fpn_4x_lvis-base_boxsup.pth' -find_unused_parameters = True diff --git a/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis_boxsup.py b/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis_boxsup.py deleted file mode 100644 index ce04a815f..000000000 --- a/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis_boxsup.py +++ /dev/null @@ -1,78 +0,0 @@ -_base_ = './detic_centernet2_r50_fpn_4x_lvis_boxsup.py' - -model = dict( - backbone=dict( - _delete_=True, - type='SwinTransformer', - embed_dims=128, - depths=[2, 2, 18, 2], - num_heads=[4, 8, 16, 32], - window_size=7, - mlp_ratio=4, - qkv_bias=True, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0.3, - patch_norm=True, - out_indices=(1, 2, 3), - with_cp=False, - convert_weights=True, - init_cfg=dict( - type='Pretrained', - checkpoint='https://github.com/SwinTransformer/storage/releases/' - 'download/v1.0.0/swin_base_patch4_window7_224_22k.pth')), - neck=dict(in_channels=[256, 512, 1024])) - -# backend = 'pillow' -backend_args = None - -train_pipeline = [ - dict(type='LoadImageFromFile', backend_args=backend_args), - dict(type='LoadAnnotations', with_bbox=True, with_mask=True), - dict( - type='RandomResize', - scale=(896, 896), - ratio_range=(0.1, 2.0), - keep_ratio=True), - dict( - type='RandomCrop', - crop_type='absolute_range', - crop_size=(896, 896), - recompute_bbox=True, - allow_negative_crop=True), - dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), - dict(type='RandomFlip', prob=0.5), - dict(type='PackDetInputs') -] - -train_dataloader = dict( - dataset=dict( - type='ClassBalancedDataset', - oversample_thr=1e-3, - dataset=dict(pipeline=train_pipeline))) - -# training schedule for 180k -max_iter = 180000 -train_cfg = dict( - type='IterBasedTrainLoop', max_iters=max_iter, val_interval=180000) - -# Enable automatic-mixed-precision training with AmpOptimWrapper. 
-optim_wrapper = dict( - type='OptimWrapper', - optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0001)) - -param_scheduler = [ - dict( - type='LinearLR', - start_factor=0.0001, - by_epoch=False, - begin=0, - end=10000), - dict( - type='CosineAnnealingLR', - begin=0, - by_epoch=False, - T_max=max_iter, - ) -] diff --git a/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis_coco_in21k.py b/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis_coco_in21k.py deleted file mode 100644 index a9ab2c69a..000000000 --- a/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis_coco_in21k.py +++ /dev/null @@ -1,2 +0,0 @@ -# not support training, only for testing -_base_ = './detic_centernet2_swin-b_fpn_4x_lvis_in21k-lvis.py' diff --git a/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis_in21k-lvis.py b/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis_in21k-lvis.py deleted file mode 100644 index de358ac34..000000000 --- a/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis_in21k-lvis.py +++ /dev/null @@ -1,116 +0,0 @@ -_base_ = './detic_centernet2_r50_fpn_4x_lvis_in21k-lvis.py' - -image_size_det = (896, 896) -image_size_cls = (448, 448) - -model = dict( - backbone=dict( - _delete_=True, - type='SwinTransformer', - embed_dims=128, - depths=[2, 2, 18, 2], - num_heads=[4, 8, 16, 32], - window_size=7, - mlp_ratio=4, - qkv_bias=True, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0.3, - patch_norm=True, - out_indices=(1, 2, 3), - with_cp=False), - neck=dict(in_channels=[256, 512, 1024])) - -backend_args = None -train_pipeline_det = [ - dict(type='LoadImageFromFile', backend_args=backend_args), - dict(type='LoadAnnotations', with_bbox=True, with_mask=True), - dict( - type='RandomResize', - scale=image_size_det, - ratio_range=(0.1, 2.0), - keep_ratio=True), - dict( - type='RandomCrop', - crop_type='absolute_range', - crop_size=image_size_det, - recompute_bbox=True, - allow_negative_crop=True), - dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), - dict(type='RandomFlip', prob=0.5), - dict(type='PackDetInputs') -] - -train_pipeline_cls = [ - dict(type='LoadImageFromFile', backend_args=backend_args), - dict(type='LoadAnnotations', with_bbox=False, with_label=True), - dict( - type='RandomResize', - scale=image_size_cls, - ratio_range=(0.5, 1.5), - keep_ratio=True), - dict( - type='RandomCrop', - crop_type='absolute_range', - crop_size=image_size_cls, - recompute_bbox=False, - bbox_clip_border=False, - allow_negative_crop=True), - dict(type='RandomFlip', prob=0.5), - dict(type='PackDetInputs') -] - -dataset_det = dict( - type='ClassBalancedDataset', - oversample_thr=1e-3, - dataset=dict( - type='LVISV1Dataset', - data_root='data/lvis/', - ann_file='annotations/lvis_v1_train.json', - data_prefix=dict(img=''), - filter_cfg=dict(filter_empty_gt=True, min_size=32), - pipeline=train_pipeline_det, - backend_args=backend_args)) - -dataset_cls = dict( - type='ImageNetLVISV1Dataset', - data_root='data/imagenet', - ann_file='annotations/imagenet_lvis_image_info.json', - data_prefix=dict(img='ImageNet-LVIS/'), - pipeline=train_pipeline_cls, - backend_args=backend_args) - -train_dataloader = dict( - _delete_=True, - batch_size=[4, 16], - num_workers=2, - persistent_workers=True, - sampler=dict(type='MultiDataSampler', dataset_ratio=[1, 4]), - batch_sampler=dict( - type='MultiDataAspectRatioBatchSampler', num_datasets=2), - dataset=dict(type='ConcatDataset', datasets=[dataset_det, dataset_cls])) - -# training schedule for 
180k -max_iter = 180000 -train_cfg = dict( - type='IterBasedTrainLoop', max_iters=max_iter, val_interval=180000) - -optim_wrapper = dict( - type='OptimWrapper', - optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0001)) - -param_scheduler = [ - dict( - type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, - end=1000), - dict( - type='CosineAnnealingLR', - begin=0, - by_epoch=False, - T_max=max_iter, - ) -] - -load_from = './first_stage/detic_centernet2_swin-b_fpn_4x_lvis_boxsup.pth' -find_unused_parameters = True diff --git a/projects/Detic_new/detic/__init__.py b/projects/Detic_new/detic/__init__.py deleted file mode 100644 index e4b0d7bb8..000000000 --- a/projects/Detic_new/detic/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .centernet_rpn_head import CenterNetRPNHead -from .detic import Detic -from .detic_bbox_head import DeticBBoxHead -from .detic_roi_head import DeticRoIHead -from .heatmap_focal_loss import HeatmapFocalLoss -from .imagenet_lvis import ImageNetLVISV1Dataset -from .zero_shot_classifier import ZeroShotClassifier - -__all__ = [ - 'CenterNetRPNHead', 'Detic', 'DeticBBoxHead', 'DeticRoIHead', - 'ZeroShotClassifier', 'HeatmapFocalLoss', 'ImageNetLVISV1Dataset' -] diff --git a/projects/Detic_new/detic/centernet_rpn_head.py b/projects/Detic_new/detic/centernet_rpn_head.py deleted file mode 100644 index 629872824..000000000 --- a/projects/Detic_new/detic/centernet_rpn_head.py +++ /dev/null @@ -1,573 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -from typing import Dict, List, Optional, Sequence, Tuple - -import torch -import torch.nn as nn -from mmcv.cnn import Scale -from mmengine import ConfigDict -from mmengine.structures import InstanceData -from torch import Tensor - -from mmdet.models.dense_heads import CenterNetUpdateHead -from mmdet.models.utils import unpack_gt_instances -from mmdet.registry import MODELS -from mmdet.structures import SampleList -from mmdet.structures.bbox import bbox2distance -from mmdet.utils import (ConfigType, InstanceList, OptConfigType, - OptInstanceList, reduce_mean) -from .iou_loss import IOULoss - -# from .heatmap_focal_loss import binary_heatmap_focal_loss_jit -INF = 1000000000 -RangeType = Sequence[Tuple[int, int]] - - -@MODELS.register_module() -class CenterNetRPNHead(CenterNetUpdateHead): - """CenterNetUpdateHead is an improved version of CenterNet in CenterNet2. - - Paper link ``_. - Args: - num_classes (int): Number of categories excluding the background - category. - in_channels (int): Number of channel in the input feature map. - regress_ranges (Sequence[Tuple[int, int]]): Regress range of multiple - level points. - hm_min_radius (int): Heatmap target minimum radius of cls branch. - Defaults to 4. - hm_min_overlap (float): Heatmap target minimum overlap of cls branch. - Defaults to 0.8. - more_pos_thresh (float): The filtering threshold when the cls branch - adds more positive samples. Defaults to 0.2. - more_pos_topk (int): The maximum number of additional positive samples - added to each gt. Defaults to 9. - soft_weight_on_reg (bool): Whether to use the soft target of the - cls branch as the soft weight of the bbox branch. - Defaults to False. - loss_cls (:obj:`ConfigDict` or dict): Config of cls loss. Defaults to - dict(type='GaussianFocalLoss', loss_weight=1.0) - loss_bbox (:obj:`ConfigDict` or dict): Config of bbox loss. Defaults to - dict(type='GIoULoss', loss_weight=2.0). 
- norm_cfg (:obj:`ConfigDict` or dict, optional): dictionary to construct - and config norm layer. Defaults to - ``norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)``. - train_cfg (:obj:`ConfigDict` or dict, optional): Training config. - Unused in CenterNet. Reserved for compatibility with - SingleStageDetector. - test_cfg (:obj:`ConfigDict` or dict, optional): Testing config - of CenterNet. - """ - - def __init__(self, - num_classes: int, - in_channels: int, - regress_ranges: RangeType = ((0, 80), (64, 160), (128, 320), - (256, 640), (512, INF)), - hm_min_radius: int = 4, - hm_min_overlap: float = 0.8, - more_pos: bool = False, - more_pos_thresh: float = 0.2, - more_pos_topk: int = 9, - soft_weight_on_reg: bool = False, - not_clamp_box: bool = False, - loss_cls: ConfigType = dict( - type='HeatmapFocalLoss', - alpha=0.25, - beta=4.0, - gamma=2.0, - pos_weight=1.0, - neg_weight=1.0, - sigmoid_clamp=1e-4, - ignore_high_fp=-1.0, - loss_weight=1.0, - ), - loss_bbox: ConfigType = dict( - type='GIoULoss', loss_weight=2.0), - norm_cfg: OptConfigType = dict( - type='GN', num_groups=32, requires_grad=True), - train_cfg: OptConfigType = None, - test_cfg: OptConfigType = None, - **kwargs) -> None: - super().__init__( - num_classes=num_classes, - in_channels=in_channels, - # loss_bbox=loss_bbox, - loss_cls=loss_cls, - norm_cfg=norm_cfg, - train_cfg=train_cfg, - test_cfg=test_cfg, - **kwargs) - self.soft_weight_on_reg = soft_weight_on_reg - self.hm_min_radius = hm_min_radius - self.more_pos_thresh = more_pos_thresh - self.more_pos_topk = more_pos_topk - self.more_pos = more_pos - self.not_clamp_box = not_clamp_box - self.delta = (1 - hm_min_overlap) / (1 + hm_min_overlap) - self.loss_bbox = IOULoss('giou') - - # GaussianFocalLoss must be sigmoid mode - self.use_sigmoid_cls = True - self.cls_out_channels = num_classes - - self.regress_ranges = regress_ranges - self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides]) - - def _init_layers(self) -> None: - """Initialize layers of the head.""" - self._init_reg_convs() - self._init_predictor() - - def forward_single(self, x: Tensor, scale: Scale, - stride: int) -> Tuple[Tensor, Tensor]: - """Forward features of a single scale level. - - Args: - x (Tensor): FPN feature maps of the specified stride. - scale (:obj:`mmcv.cnn.Scale`): Learnable scale module to resize - the bbox prediction. - stride (int): The corresponding stride for feature maps. - - Returns: - tuple: scores for each class, bbox predictions of - input feature maps. - """ - for m in self.reg_convs: - x = m(x) - cls_score = self.conv_cls(x) - bbox_pred = self.conv_reg(x) - # scale the bbox_pred of different level - # float to avoid overflow when enabling FP16 - bbox_pred = scale(bbox_pred).float() - # bbox_pred needed for gradient computation has been modified - # by F.relu(bbox_pred) when run with PyTorch 1.10. So replace - # F.relu(bbox_pred) with bbox_pred.clamp(min=0) - bbox_pred = bbox_pred.clamp(min=0) - return cls_score, bbox_pred # score aligned, box larger - - def loss_by_feat( - self, - cls_scores: List[Tensor], - bbox_preds: List[Tensor], - batch_gt_instances: InstanceList, - batch_img_metas: List[dict], - batch_gt_instances_ignore: OptInstanceList = None - ) -> Dict[str, Tensor]: - """Calculate the loss based on the features extracted by the detection - head. - - Args: - cls_scores (list[Tensor]): Box scores for each scale level, - each is a 4D-tensor, the channel number is num_classes. 
- bbox_preds (list[Tensor]): Box energies / deltas for each scale - level, each is a 4D-tensor, the channel number is 4. - batch_gt_instances (list[:obj:`InstanceData`]): Batch of - gt_instance. It usually includes ``bboxes`` and ``labels`` - attributes. - batch_img_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): - Batch of gt_instances_ignore. It includes ``bboxes`` attribute - data that is ignored during training and testing. - Defaults to None. - - Returns: - dict[str, Tensor]: A dictionary of loss components. - """ - - num_imgs = cls_scores[0].size(0) - assert len(cls_scores) == len(bbox_preds) - featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] - all_level_points = self.prior_generator.grid_priors( - featmap_sizes, - dtype=bbox_preds[0].dtype, - device=bbox_preds[0].device) - - # 1 flatten outputs - flatten_cls_scores = [ - cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels) - for cls_score in cls_scores - ] - flatten_bbox_preds = [ - bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4) - for bbox_pred in bbox_preds - ] - flatten_cls_scores = torch.cat(flatten_cls_scores) - flatten_bbox_preds = torch.cat(flatten_bbox_preds) - - # repeat points to align with bbox_preds - flatten_points = torch.cat( - [points.repeat(num_imgs, 1) for points in all_level_points]) - - assert (torch.isfinite(flatten_bbox_preds).all().item()) - - # 2 calc reg and cls branch targets - cls_targets, bbox_targets = self.get_targets(all_level_points, - batch_gt_instances) - - # 3 pos index for cls branch - featmap_sizes = flatten_points.new_tensor(featmap_sizes) - - if self.more_pos: - pos_inds, cls_labels = self.add_cls_pos_inds( - flatten_points, flatten_bbox_preds, featmap_sizes, - batch_gt_instances) - else: - pos_inds = self._get_label_inds(batch_gt_instances, - batch_img_metas, featmap_sizes) - - # 4 calc cls loss - if pos_inds is None: - # num_gts=0 - num_pos_cls = bbox_preds[0].new_tensor(0, dtype=torch.float) - else: - num_pos_cls = bbox_preds[0].new_tensor( - len(pos_inds), dtype=torch.float) - num_pos_cls = max(reduce_mean(num_pos_cls), 1.0) - - cat_agn_cls_targets = cls_targets.max(dim=1)[0] # M - - cls_pos_loss, cls_neg_loss = self.loss_cls( - flatten_cls_scores.squeeze(1), cat_agn_cls_targets, pos_inds, - num_pos_cls) - - # 5 calc reg loss - pos_bbox_inds = torch.nonzero( - bbox_targets.max(dim=1)[0] >= 0).squeeze(1) - pos_bbox_preds = flatten_bbox_preds[pos_bbox_inds] - pos_bbox_targets = bbox_targets[pos_bbox_inds] - - bbox_weight_map = cls_targets.max(dim=1)[0] - bbox_weight_map = bbox_weight_map[pos_bbox_inds] - bbox_weight_map = bbox_weight_map if self.soft_weight_on_reg \ - else torch.ones_like(bbox_weight_map) - - num_pos_bbox = max(reduce_mean(bbox_weight_map.sum()), 1.0) - - if len(pos_bbox_inds) > 0: - bbox_loss = self.loss_bbox( - pos_bbox_preds, - pos_bbox_targets, - bbox_weight_map, - reduction='sum') / num_pos_bbox - else: - bbox_loss = flatten_bbox_preds.sum() * 0 - - return dict( - loss_bbox=bbox_loss, - loss_cls_pos=cls_pos_loss, - loss_cls_neg=cls_neg_loss) - - def loss_and_predict( - self, - x: Tuple[Tensor], - batch_data_samples: SampleList, - proposal_cfg: Optional[ConfigDict] = None - ) -> Tuple[dict, InstanceList]: - """Perform forward propagation of the head, then calculate loss and - predictions from the features and data samples. - - Args: - x (tuple[Tensor]): Features from FPN. 
- batch_data_samples (list[:obj:`DetDataSample`]): Each item contains - the meta information of each image and corresponding - annotations. - proposal_cfg (ConfigDict, optional): Test / postprocessing - configuration, if None, test_cfg would be used. - Defaults to None. - - Returns: - tuple: the return value is a tuple contains: - - - losses: (dict[str, Tensor]): A dictionary of loss components. - - predictions (list[:obj:`InstanceData`]): Detection - results of each image after the post process. - """ - outputs = unpack_gt_instances(batch_data_samples) - (batch_gt_instances, batch_gt_instances_ignore, - batch_img_metas) = outputs - - outs = self(x) - - loss_inputs = outs + (batch_gt_instances, batch_img_metas, - batch_gt_instances_ignore) - losses = self.loss_by_feat(*loss_inputs) - predictions = self.predict_by_feat( - *outs, batch_img_metas=batch_img_metas, cfg=proposal_cfg) - return losses, predictions - - def _predict_by_feat_single(self, - cls_score_list: List[Tensor], - bbox_pred_list: List[Tensor], - score_factor_list: List[Tensor], - mlvl_priors: List[Tensor], - img_meta: dict, - cfg: ConfigDict, - rescale: bool = False, - with_nms: bool = True) -> InstanceData: - """Transform a single image's features extracted from the head into - bbox results. - - Args: - cls_score_list (list[Tensor]): Box scores from all scale - levels of a single image, each item has shape - (num_priors * num_classes, H, W). - bbox_pred_list (list[Tensor]): Box energies / deltas from - all scale levels of a single image, each item has shape - (num_priors * 4, H, W). - score_factor_list (list[Tensor]): Score factor from all scale - levels of a single image, each item has shape - (num_priors * 1, H, W). - mlvl_priors (list[Tensor]): Each element in the list is - the priors of a single level in feature pyramid. In all - anchor-based methods, it has shape (num_priors, 4). In - all anchor-free methods, it has shape (num_priors, 2) - when `with_stride=True`, otherwise it still has shape - (num_priors, 4). - img_meta (dict): Image meta info. - cfg (mmengine.Config): Test / postprocessing configuration, - if None, test_cfg would be used. - rescale (bool): If True, return boxes in original image space. - Defaults to False. - with_nms (bool): If True, do nms before return boxes. - Defaults to True. - - Returns: - :obj:`InstanceData`: Detection results of each image - after the post process. - Each item usually contains following keys. - - - scores (Tensor): Classification scores, has a shape - (num_instance, ) - - labels (Tensor): Labels of bboxes, has a shape - (num_instances, ). - - bboxes (Tensor): Has a shape (num_instances, 4), - the last dimension 4 arrange as (x1, y1, x2, y2). 
- """ - - cfg = self.test_cfg if cfg is None else cfg - cfg = copy.deepcopy(cfg) - nms_pre = cfg.get('nms_pre', -1) - - mlvl_bbox_preds = [] - mlvl_valid_priors = [] - mlvl_scores = [] - mlvl_labels = [] - - for level_idx, (cls_score, bbox_pred, score_factor, priors) in \ - enumerate(zip(cls_score_list, bbox_pred_list, - score_factor_list, mlvl_priors)): - - assert cls_score.size()[-2:] == bbox_pred.size()[-2:] - - bbox_pred = bbox_pred * self.strides[level_idx] - - dim = self.bbox_coder.encode_size - bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, dim) - cls_score = cls_score.permute(1, 2, - 0).reshape(-1, self.cls_out_channels) - heatmap = cls_score.sigmoid() - score_thr = cfg.get('score_thr', 0) - - candidate_inds = heatmap > score_thr # 0.05 - pre_nms_top_n = candidate_inds.sum() # N - pre_nms_top_n = pre_nms_top_n.clamp(max=nms_pre) # N - - heatmap = heatmap[candidate_inds] # n - - candidate_nonzeros = candidate_inds.nonzero() # n - box_loc = candidate_nonzeros[:, 0] # n - labels = candidate_nonzeros[:, 1] # n - - bbox_pred = bbox_pred[box_loc] # n x 4 - per_grids = priors[box_loc] # n x 2 - - if candidate_inds.sum().item() > pre_nms_top_n.item(): - heatmap, top_k_indices = \ - heatmap.topk(pre_nms_top_n, sorted=False) - labels = labels[top_k_indices] - bbox_pred = bbox_pred[top_k_indices] - per_grids = per_grids[top_k_indices] - - bboxes = torch.stack([ - per_grids[:, 0] - bbox_pred[:, 0], - per_grids[:, 1] - bbox_pred[:, 1], - per_grids[:, 0] + bbox_pred[:, 2], - per_grids[:, 1] + bbox_pred[:, 3], - ], - dim=1) # n x 4 - - # avoid invalid boxes in RoI heads - bboxes[:, 2] = torch.max(bboxes[:, 2], bboxes[:, 0] + 0.01) - bboxes[:, 3] = torch.max(bboxes[:, 3], bboxes[:, 1] + 0.01) - - # bboxes = self.bbox_coder.decode(per_grids, bbox_pred) - # # avoid invalid boxes in RoI heads - # bboxes[:, 2] = torch.max(bboxes[:, 2], bboxes[:, 0] + 0.01) - # bboxes[:, 3] = torch.max(bboxes[:, 3], bboxes[:, 1] + 0.01) - - mlvl_bbox_preds.append(bboxes) - mlvl_valid_priors.append(priors) - mlvl_scores.append(torch.sqrt(heatmap)) - mlvl_labels.append(labels) - - results = InstanceData() - results.bboxes = torch.cat(mlvl_bbox_preds) - results.scores = torch.cat(mlvl_scores) - results.labels = torch.cat(mlvl_labels) - - return self._bbox_post_process( - results=results, - cfg=cfg, - rescale=rescale, - with_nms=with_nms, - img_meta=img_meta) - - def _get_label_inds(self, batch_gt_instances, batch_img_metas, - shapes_per_level): - ''' - Inputs: - batch_gt_instances: [n_i], sum n_i = N - shapes_per_level: L x 2 [(h_l, w_l)]_L - Returns: - pos_inds: N' - labels: N' - ''' - pos_inds = [] - L = len(self.strides) - B = len(batch_gt_instances) - shapes_per_level = shapes_per_level.long() - loc_per_level = (shapes_per_level[:, 0] * - shapes_per_level[:, 1]).long() # L - level_bases = [] - s = 0 - for i in range(L): - level_bases.append(s) - s = s + B * loc_per_level[i] - level_bases = shapes_per_level.new_tensor(level_bases).long() # L - strides_default = shapes_per_level.new_tensor( - self.strides).float() # L - for im_i in range(B): - targets_per_im = batch_gt_instances[im_i] - if hasattr(targets_per_im, 'bboxes'): - bboxes = targets_per_im.bboxes # n x 4 - else: - bboxes = targets_per_im.labels.new_tensor( - [], dtype=torch.float).reshape(-1, 4) - n = bboxes.shape[0] - centers = ((bboxes[:, [0, 1]] + bboxes[:, [2, 3]]) / 2) # n x 2 - centers = centers.view(n, 1, 2).expand(n, L, 2).contiguous() - if self.not_clamp_box: - h, w = batch_img_metas[im_i]._image_size - centers[:, :, 0].clamp_(min=0).clamp_(max=w 
- 1) - centers[:, :, 1].clamp_(min=0).clamp_(max=h - 1) - strides = strides_default.view(1, L, 1).expand(n, L, 2) - centers_inds = (centers / strides).long() # n x L x 2 - Ws = shapes_per_level[:, 1].view(1, L).expand(n, L) - pos_ind = level_bases.view(1, L).expand(n, L) \ - + im_i * loc_per_level.view(1, L).expand(n, L) \ - + centers_inds[:, :, 1] * Ws + centers_inds[:, :, 0] # n x L - is_cared_in_the_level = self.assign_fpn_level(bboxes) - pos_ind = pos_ind[is_cared_in_the_level].view(-1) - - pos_inds.append(pos_ind) # n' - pos_inds = torch.cat(pos_inds, dim=0).long() - return pos_inds # N, N - - def assign_fpn_level(self, boxes): - ''' - Inputs: - boxes: n x 4 - size_ranges: L x 2 - Return: - is_cared_in_the_level: n x L - ''' - size_ranges = boxes.new_tensor(self.regress_ranges).view( - len(self.regress_ranges), 2) # L x 2 - crit = ((boxes[:, 2:] - boxes[:, :2])**2).sum(dim=1)**0.5 / 2 # n - n, L = crit.shape[0], size_ranges.shape[0] - crit = crit.view(n, 1).expand(n, L) - size_ranges_expand = size_ranges.view(1, L, 2).expand(n, L, 2) - is_cared_in_the_level = (crit >= size_ranges_expand[:, :, 0]) & \ - (crit <= size_ranges_expand[:, :, 1]) - return is_cared_in_the_level - - def _get_targets_single(self, gt_instances: InstanceData, points: Tensor, - regress_ranges: Tensor, - strides: Tensor) -> Tuple[Tensor, Tensor]: - """Compute classification and bbox targets for a single image.""" - num_points = points.size(0) - num_gts = len(gt_instances) - gt_labels = gt_instances.labels - - if not hasattr(gt_instances, 'bboxes'): - gt_bboxes = gt_labels.new_tensor([], dtype=torch.float) - else: - gt_bboxes = gt_instances.bboxes - - if not hasattr(gt_instances, 'bboxes') or num_gts == 0: - return gt_labels.new_full((num_points, - self.num_classes), - self.num_classes, - dtype=torch.float), \ - gt_bboxes.new_full((num_points, 4), -1) - - # Calculate the regression tblr target corresponding to all points - points = points[:, None].expand(num_points, num_gts, 2) - gt_bboxes = gt_bboxes[None].expand(num_points, num_gts, 4) - strides = strides[:, None, None].expand(num_points, num_gts, 2) - - bbox_target = bbox2distance(points, gt_bboxes) # M x N x 4 - - # condition1: inside a gt bbox - inside_gt_bbox_mask = bbox_target.min(dim=2)[0] > 0 # M x N - - # condition2: Calculate the nearest points from - # the upper, lower, left and right ranges from - # the center of the gt bbox - centers = ((gt_bboxes[..., [0, 1]] + gt_bboxes[..., [2, 3]]) / 2) - centers_discret = ((centers / strides).int() * strides).float() + \ - strides / 2 - - centers_discret_dist = points - centers_discret - dist_x = centers_discret_dist[..., 0].abs() - dist_y = centers_discret_dist[..., 1].abs() - inside_gt_center3x3_mask = (dist_x <= strides[..., 0]) & \ - (dist_y <= strides[..., 0]) - - # condition3: limit the regression range for each location - bbox_target_wh = bbox_target[..., :2] + bbox_target[..., 2:] - crit = (bbox_target_wh**2).sum(dim=2)**0.5 / 2 - inside_fpn_level_mask = (crit >= regress_ranges[:, [0]]) & \ - (crit <= regress_ranges[:, [1]]) - bbox_target_mask = inside_gt_bbox_mask & \ - inside_gt_center3x3_mask & \ - inside_fpn_level_mask - - # Calculate the distance weight map - gt_center_peak_mask = ((centers_discret_dist**2).sum(dim=2) == 0) - weighted_dist = ((points - centers)**2).sum(dim=2) # M x N - weighted_dist[gt_center_peak_mask] = 0 - - areas = (gt_bboxes[..., 2] - gt_bboxes[..., 0]) * ( - gt_bboxes[..., 3] - gt_bboxes[..., 1]) - radius = self.delta**2 * 2 * areas - radius = torch.clamp(radius, 
min=self.hm_min_radius**2) - weighted_dist = weighted_dist / radius - - # Calculate bbox_target - bbox_weighted_dist = weighted_dist.clone() - bbox_weighted_dist[bbox_target_mask == 0] = INF * 1.0 - min_dist, min_inds = bbox_weighted_dist.min(dim=1) - bbox_target = bbox_target[range(len(bbox_target)), - min_inds] # M x N x 4 --> M x 4 - bbox_target[min_dist == INF] = -INF - - # Convert to feature map scale - bbox_target /= strides[:, 0, :].repeat(1, 2) - - # Calculate cls_target - cls_target = self._create_heatmaps_from_dist(weighted_dist, gt_labels) - - return cls_target, bbox_target diff --git a/projects/Detic_new/detic/detic.py b/projects/Detic_new/detic/detic.py deleted file mode 100644 index 7028690ac..000000000 --- a/projects/Detic_new/detic/detic.py +++ /dev/null @@ -1,274 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -from typing import List, Union - -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -from mmengine.logging import print_log -from torch import Tensor - -from mmdet.datasets import LVISV1Dataset -from mmdet.models.detectors.cascade_rcnn import CascadeRCNN -from mmdet.registry import MODELS -from mmdet.structures import SampleList - - -class CLIPTextEncoder(nn.Module): - - def __init__(self, model_name='ViT-B/32'): - super().__init__() - import clip - from clip.simple_tokenizer import SimpleTokenizer - self.tokenizer = SimpleTokenizer() - pretrained_model, _ = clip.load(model_name, device='cpu') - self.clip = pretrained_model - - @property - def device(self): - return self.clip.device - - @property - def dtype(self): - return self.clip.dtype - - def tokenize(self, - texts: Union[str, List[str]], - context_length: int = 77) -> torch.LongTensor: - if isinstance(texts, str): - texts = [texts] - - sot_token = self.tokenizer.encoder['<|startoftext|>'] - eot_token = self.tokenizer.encoder['<|endoftext|>'] - all_tokens = [[sot_token] + self.tokenizer.encode(text) + [eot_token] - for text in texts] - result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) - - for i, tokens in enumerate(all_tokens): - if len(tokens) > context_length: - st = torch.randint(len(tokens) - context_length + 1, - (1, ))[0].item() - tokens = tokens[st:st + context_length] - result[i, :len(tokens)] = torch.tensor(tokens) - - return result - - def forward(self, text): - text = self.tokenize(text) - text_features = self.clip.encode_text(text) - return text_features - - -def get_class_weight(original_caption, prompt_prefix='a '): - if isinstance(original_caption, str): - if original_caption == 'coco': - from mmdet.datasets import CocoDataset - class_names = CocoDataset.METAINFO['classes'] - elif original_caption == 'cityscapes': - from mmdet.datasets import CityscapesDataset - class_names = CityscapesDataset.METAINFO['classes'] - elif original_caption == 'voc': - from mmdet.datasets import VOCDataset - class_names = VOCDataset.METAINFO['classes'] - elif original_caption == 'openimages': - from mmdet.datasets import OpenImagesDataset - class_names = OpenImagesDataset.METAINFO['classes'] - elif original_caption == 'lvis': - from mmdet.datasets import LVISV1Dataset - class_names = LVISV1Dataset.METAINFO['classes'] - else: - if not original_caption.endswith('.'): - original_caption = original_caption + ' . ' - original_caption = original_caption.split(' . 
') - class_names = list(filter(lambda x: len(x) > 0, original_caption)) - - # for test.py - else: - class_names = list(original_caption) - - text_encoder = CLIPTextEncoder() - text_encoder.eval() - texts = [prompt_prefix + x for x in class_names] - print_log(f'Computing text embeddings for {len(class_names)} classes.') - embeddings = text_encoder(texts).detach().permute(1, 0).contiguous().cpu() - return class_names, embeddings - - -def reset_cls_layer_weight(roi_head, weight): - if type(weight) == str: - print_log(f'Resetting cls_layer_weight from file: {weight}') - zs_weight = torch.tensor( - np.load(weight), - dtype=torch.float32).permute(1, 0).contiguous() # D x C - else: - zs_weight = weight - zs_weight = torch.cat( - [zs_weight, zs_weight.new_zeros( - (zs_weight.shape[0], 1))], dim=1) # D x (C + 1) - zs_weight = F.normalize(zs_weight, p=2, dim=0) - zs_weight = zs_weight.to('cuda') - num_classes = zs_weight.shape[-1] - - for bbox_head in roi_head.bbox_head: - bbox_head.num_classes = num_classes - del bbox_head.fc_cls.zs_weight - bbox_head.fc_cls.zs_weight = zs_weight - - -@MODELS.register_module() -class Detic(CascadeRCNN): - - def __init__(self, - with_image_labels: bool = False, - sync_caption_batch: bool = False, - fp16: bool = False, - roi_head_name: str = '', - cap_batch_ratio: int = 4, - with_caption: bool = False, - dynamic_classifier: bool = False, - **kwargs) -> None: - super().__init__(**kwargs) - - self._entities = LVISV1Dataset.METAINFO['classes'] - self._text_prompts = None - # Turn on co-training with classification data - self.with_image_labels = with_image_labels - # Caption losses - self.with_caption = with_caption - # synchronize across GPUs to enlarge # "classes" - self.sync_caption_batch = sync_caption_batch - # Ratio between detection data and caption data - self.cap_batch_ratio = cap_batch_ratio - self.fp16 = fp16 - self.roi_head_name = roi_head_name - # dynamic class sampling when training with 21K classes, - # Federated loss is enabled when DYNAMIC_CLASSIFIER is on - self.dynamic_classifier = dynamic_classifier - self.return_proposal = False - if self.dynamic_classifier: - self.freq_weight = kwargs.pop('freq_weight') - self.num_classes = kwargs.pop('num_classes') - self.num_sample_cats = kwargs.pop('num_sample_cats') - - def loss(self, batch_inputs: Tensor, - batch_data_samples: SampleList) -> dict: - """Calculate losses from a batch of inputs and data samples. - - Args: - batch_inputs (Tensor): Input images of shape (N, C, H, W). - These should usually be mean centered and std scaled. - batch_data_samples (List[:obj:`DetDataSample`]): The batch - data samples. It usually includes information such - as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. 
- - Returns: - dict: A dictionary of loss components - """ - - x = self.extract_feat(batch_inputs) - losses = dict() - - # RPN forward and loss - if self.with_rpn: - proposal_cfg = self.train_cfg.get('rpn_proposal', - self.test_cfg.rpn) - rpn_data_samples = copy.deepcopy(batch_data_samples) - # set cat_id of gt_labels to 0 in RPN - for data_sample in rpn_data_samples: - data_sample.gt_instances.labels = \ - torch.zeros_like(data_sample.gt_instances.labels) - - rpn_losses, rpn_results_list = self.rpn_head.loss_and_predict( - x, rpn_data_samples, proposal_cfg=proposal_cfg) - - # avoid get same name with roi_head loss - keys = rpn_losses.keys() - for key in list(keys): - if 'loss' in key and 'rpn' not in key: - rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key) - losses.update(rpn_losses) - # if not hasattr(batch_data_samples[0].gt_instances, 'bboxes'): - # losses.update({k: v * 0 for k, v in rpn_losses.items()}) - # else: - # losses.update(rpn_losses) - else: - assert batch_data_samples[0].get('proposals', None) is not None - # use pre-defined proposals in InstanceData for the second stage - # to extract ROI features. - rpn_results_list = [ - data_sample.proposals for data_sample in batch_data_samples - ] - - roi_losses = self.roi_head.loss(x, rpn_results_list, - batch_data_samples) - - losses.update(roi_losses) - - return losses - - def predict(self, - batch_inputs: Tensor, - batch_data_samples: SampleList, - rescale: bool = True) -> SampleList: - """Predict results from a batch of inputs and data samples with post- - processing. - - Args: - batch_inputs (Tensor): Inputs with shape (N, C, H, W). - batch_data_samples (List[:obj:`DetDataSample`]): The Data - Samples. It usually includes information such as - `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. - rescale (bool): Whether to rescale the results. - Defaults to True. - - Returns: - list[:obj:`DetDataSample`]: Return the detection results of the - input images. The returns value is DetDataSample, - which usually contain 'pred_instances'. And the - ``pred_instances`` usually contains following keys. - - - scores (Tensor): Classification scores, has a shape - (num_instance, ) - - labels (Tensor): Labels of bboxes, has a shape - (num_instances, ). - - bboxes (Tensor): Has a shape (num_instances, 4), - the last dimension 4 arrange as (x1, y1, x2, y2). - - masks (Tensor): Has a shape (num_instances, H, W). - """ - # For single image inference - if 'custom_entities' in batch_data_samples[0]: - text_prompts = batch_data_samples[0].text - if text_prompts != self._text_prompts: - self._text_prompts = text_prompts - class_names, zs_weight = get_class_weight(text_prompts) - self._entities = class_names - reset_cls_layer_weight(self.roi_head, zs_weight) - - assert self.with_bbox, 'Bbox head must be implemented.' 
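# ---------------------------------------------------------------------------
# A minimal sketch of the open-vocabulary classifier pattern used above: one
# prompt per class name is encoded with CLIP, the embeddings are
# L2-normalized, and the resulting matrix plays the role of the
# classification-layer weight so RoI features are scored by cosine
# similarity.  Assumes the `clip` package is available; `roi_feats` below is
# a hypothetical (N, D) tensor of projected RoI features.
import clip
import torch
import torch.nn.functional as F


def build_text_classifier(class_names, prompt_prefix='a '):
    model, _ = clip.load('ViT-B/32', device='cpu')
    tokens = clip.tokenize([prompt_prefix + name for name in class_names])
    with torch.no_grad():
        embeddings = model.encode_text(tokens).float()  # (C, D)
    return F.normalize(embeddings, p=2, dim=1)          # unit-norm rows

# usage sketch:
#   zs_weight = build_text_classifier(['person', 'dog', 'traffic light'])
#   logits = F.normalize(roi_feats, dim=1) @ zs_weight.t()  # (N, C)
# ---------------------------------------------------------------------------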
- - x = self.extract_feat(batch_inputs) - - # If there are no pre-defined proposals, use RPN to get proposals - if batch_data_samples[0].get('proposals', None) is None: - rpn_results_list = self.rpn_head.predict( - x, batch_data_samples, rescale=False) - else: - rpn_results_list = [ - data_sample.proposals for data_sample in batch_data_samples - ] - - results_list = self.roi_head.predict( - x, rpn_results_list, batch_data_samples, rescale=rescale) - - for data_sample, pred_instances in zip(batch_data_samples, - results_list): - if len(pred_instances) > 0: - label_names = [] - for labels in pred_instances.labels: - label_names.append(self._entities[labels]) - # for visualization - pred_instances.label_names = label_names - data_sample.pred_instances = pred_instances - - return batch_data_samples diff --git a/projects/Detic_new/detic/detic_bbox_head.py b/projects/Detic_new/detic/detic_bbox_head.py deleted file mode 100644 index 8779494ba..000000000 --- a/projects/Detic_new/detic/detic_bbox_head.py +++ /dev/null @@ -1,434 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import json -from typing import List, Optional - -import torch -from mmengine.config import ConfigDict -from mmengine.structures import InstanceData -from torch import Tensor -from torch.nn import functional as F - -from mmdet.models.layers import multiclass_nms -from mmdet.models.losses import accuracy -from mmdet.models.roi_heads.bbox_heads import Shared2FCBBoxHead -from mmdet.models.utils import empty_instances -from mmdet.registry import MODELS -from mmdet.structures.bbox import get_box_tensor, scale_boxes -from mmdet.utils import ConfigType, InstanceList - - -def load_class_freq(path='datasets/metadata/lvis_v1_train_cat_info.json', - freq_weight=0.5): - cat_info = json.load(open(path, 'r')) - cat_info = torch.tensor( - [c['image_count'] for c in sorted(cat_info, key=lambda x: x['id'])]) - freq_weight = cat_info.float()**freq_weight - return freq_weight - - -def get_fed_loss_inds(labels, num_sample_cats, C, weight=None): - - appeared = torch.unique(labels) # C' - prob = appeared.new_ones(C + 1).float() - prob[-1] = 0 - if len(appeared) < num_sample_cats: - if weight is not None: - prob[:C] = weight.float().clone() - prob[appeared] = 0 - more_appeared = torch.multinomial( - prob, num_sample_cats - len(appeared), replacement=False) - appeared = torch.cat([appeared, more_appeared]) - return appeared - - -@MODELS.register_module() -class DeticBBoxHead(Shared2FCBBoxHead): - - def __init__(self, - image_loss_weight: float = 0.1, - use_fed_loss: bool = False, - cat_freq_path: str = '', - fed_loss_freq_weight: float = 0.5, - fed_loss_num_cat: int = 50, - cls_predictor_cfg: ConfigType = dict( - type='ZeroShotClassifier'), - *args, - **kwargs) -> None: - super().__init__(*args, **kwargs) - # reconstruct fc_cls and fc_reg since input channels are changed - assert self.with_cls - - self.cls_predictor_cfg = cls_predictor_cfg - cls_channels = self.num_classes - self.cls_predictor_cfg.update( - in_features=self.cls_last_dim, out_features=cls_channels) - self.fc_cls = MODELS.build(self.cls_predictor_cfg) - - self.init_cfg += [ - dict(type='Caffe2Xavier', override=dict(name='reg_fcs')) - ] - - self.image_loss_weight = image_loss_weight - self.use_fed_loss = use_fed_loss - self.cat_freq_path = cat_freq_path - self.fed_loss_freq_weight = fed_loss_freq_weight - self.fed_loss_num_cat = fed_loss_num_cat - - if self.use_fed_loss: - freq_weight = load_class_freq(cat_freq_path, fed_loss_freq_weight) - self.register_buffer('freq_weight', 
freq_weight) - else: - self.freq_weight = None - - def _predict_by_feat_single( - self, - roi: Tensor, - cls_score: Tensor, - bbox_pred: Tensor, - img_meta: dict, - rescale: bool = False, - rcnn_test_cfg: Optional[ConfigDict] = None) -> InstanceData: - """Transform a single image's features extracted from the head into - bbox results. - - Args: - roi (Tensor): Boxes to be transformed. Has shape (num_boxes, 5). - last dimension 5 arrange as (batch_index, x1, y1, x2, y2). - cls_score (Tensor): Box scores, has shape - (num_boxes, num_classes + 1). - bbox_pred (Tensor): Box energies / deltas. - has shape (num_boxes, num_classes * 4). - img_meta (dict): image information. - rescale (bool): If True, return boxes in original image space. - Defaults to False. - rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of Bbox Head. - Defaults to None - - Returns: - :obj:`InstanceData`: Detection results of each image\ - Each item usually contains following keys. - - - scores (Tensor): Classification scores, has a shape - (num_instance, ) - - labels (Tensor): Labels of bboxes, has a shape - (num_instances, ). - - bboxes (Tensor): Has a shape (num_instances, 4), - the last dimension 4 arrange as (x1, y1, x2, y2). - """ - results = InstanceData() - if roi.shape[0] == 0: - return empty_instances([img_meta], - roi.device, - task_type='bbox', - instance_results=[results], - box_type=self.predict_box_type, - use_box_type=False, - num_classes=self.num_classes, - score_per_cls=rcnn_test_cfg is None)[0] - scores = cls_score - img_shape = img_meta['img_shape'] - num_rois = roi.size(0) - - num_classes = 1 if self.reg_class_agnostic else self.num_classes - roi = roi.repeat_interleave(num_classes, dim=0) - bbox_pred = bbox_pred.view(-1, self.bbox_coder.encode_size) - bboxes = self.bbox_coder.decode( - roi[..., 1:], bbox_pred, max_shape=img_shape) - - if rescale and bboxes.size(0) > 0: - assert img_meta.get('scale_factor') is not None - scale_factor = [1 / s for s in img_meta['scale_factor']] - bboxes = scale_boxes(bboxes, scale_factor) - - # Get the inside tensor when `bboxes` is a box type - bboxes = get_box_tensor(bboxes) - box_dim = bboxes.size(-1) - bboxes = bboxes.view(num_rois, -1) - - if rcnn_test_cfg is None: - # This means that it is aug test. - # It needs to return the raw results without nms. - results.bboxes = bboxes - results.scores = scores - else: - det_bboxes, det_labels = multiclass_nms( - bboxes, - scores, - rcnn_test_cfg.score_thr, - rcnn_test_cfg.nms, - rcnn_test_cfg.max_per_img, - box_dim=box_dim) - results.bboxes = det_bboxes[:, :-1] - results.scores = det_bboxes[:, -1] - results.labels = det_labels - return results - - def loss(self, - cls_score: Tensor, - bbox_pred: Tensor, - rois: Tensor, - labels: Tensor, - label_weights: Tensor, - bbox_targets: Tensor, - bbox_weights: Tensor, - reduction_override: Optional[str] = None) -> dict: - """Calculate the loss based on the network predictions and targets. - - Args: - cls_score (Tensor): Classification prediction - results of all class, has shape - (batch_size * num_proposals_single_image, num_classes) - bbox_pred (Tensor): Regression prediction results, - has shape - (batch_size * num_proposals_single_image, 4), the last - dimension 4 represents [tl_x, tl_y, br_x, br_y]. - rois (Tensor): RoIs with the shape - (batch_size * num_proposals_single_image, 5) where the first - column indicates batch id of each RoI. - labels (Tensor): Gt_labels for all proposals in a batch, has - shape (batch_size * num_proposals_single_image, ). 
- label_weights (Tensor): Labels_weights for all proposals in a - batch, has shape (batch_size * num_proposals_single_image, ). - bbox_targets (Tensor): Regression target for all proposals in a - batch, has shape (batch_size * num_proposals_single_image, 4), - the last dimension 4 represents [tl_x, tl_y, br_x, br_y]. - bbox_weights (Tensor): Regression weights for all proposals in a - batch, has shape (batch_size * num_proposals_single_image, 4). - reduction_override (str, optional): The reduction - method used to override the original reduction - method of the loss. Options are "none", - "mean" and "sum". Defaults to None, - - Returns: - dict: A dictionary of loss. - """ - - losses = dict() - - if cls_score is not None: - - if cls_score.numel() > 0: - loss_cls_ = self.sigmoid_cross_entropy_loss(cls_score, labels) - if isinstance(loss_cls_, dict): - losses.update(loss_cls_) - else: - losses['loss_cls'] = loss_cls_ - if self.custom_activation: - acc_ = self.loss_cls.get_accuracy(cls_score, labels) - losses.update(acc_) - else: - losses['acc'] = accuracy(cls_score, labels) - if bbox_pred is not None: - bg_class_ind = self.num_classes - # 0~self.num_classes-1 are FG, self.num_classes is BG - pos_inds = (labels >= 0) & (labels < bg_class_ind) - # do not perform bounding box regression for BG anymore. - if pos_inds.any(): - if self.reg_decoded_bbox: - # When the regression loss (e.g. `IouLoss`, - # `GIouLoss`, `DIouLoss`) is applied directly on - # the decoded bounding boxes, it decodes the - # already encoded coordinates to absolute format. - bbox_pred = self.bbox_coder.decode(rois[:, 1:], bbox_pred) - bbox_pred = get_box_tensor(bbox_pred) - if self.reg_class_agnostic: - pos_bbox_pred = bbox_pred.view( - bbox_pred.size(0), -1)[pos_inds.type(torch.bool)] - else: - pos_bbox_pred = bbox_pred.view( - bbox_pred.size(0), self.num_classes, - -1)[pos_inds.type(torch.bool), - labels[pos_inds.type(torch.bool)]] - - losses['loss_bbox'] = self.loss_bbox( - pos_bbox_pred, - bbox_targets[pos_inds.type(torch.bool)], - bbox_weights[pos_inds.type(torch.bool)], - avg_factor=bbox_targets.size(0), - reduction_override=reduction_override) - else: - losses['loss_bbox'] = bbox_pred[pos_inds].sum() - return losses - - def sigmoid_cross_entropy_loss(self, cls_score, labels): - if cls_score.numel() == 0: - return cls_score.new_zeros( - [1])[0] # This is more robust than .sum() * 0. 
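# ---------------------------------------------------------------------------
# The federated-loss weight computed below restricts the binary cross-entropy
# to a subset of classes: every class that appears in the batch plus extra
# classes drawn according to a per-class frequency weight.  A minimal,
# self-contained sketch of that sampling step, assuming foreground labels
# only and a made-up 8-class frequency vector:
import torch


def sample_fed_classes(labels, num_sample_cats, freq_weight):
    """labels: (M,) foreground class ids; freq_weight: (C,) sampling weights."""
    appeared = torch.unique(labels)              # classes seen in this batch
    if len(appeared) < num_sample_cats:
        prob = freq_weight.clone().float()
        prob[appeared] = 0                       # never resample a seen class
        extra = torch.multinomial(
            prob, num_sample_cats - len(appeared), replacement=False)
        appeared = torch.cat([appeared, extra])
    return appeared

# usage sketch:
#   keep = sample_fed_classes(torch.tensor([1, 1, 3]), 4, torch.rand(8))
#   BCE columns outside `keep` then receive zero weight, as in the code below.
# ---------------------------------------------------------------------------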
- B = cls_score.shape[0] - C = cls_score.shape[1] - 1 - - target = cls_score.new_zeros(B, C + 1) - target[range(len(labels)), labels] = 1 # B x (C + 1) - target = target[:, :C] # B x C - - weight = 1 - if self.use_fed_loss and (self.freq_weight is not None): # fedloss - appeared = get_fed_loss_inds( - labels, - num_sample_cats=self.fed_loss_num_cat, - C=C, - weight=self.freq_weight) - appeared_mask = appeared.new_zeros(C + 1) - appeared_mask[appeared] = 1 # C + 1 - appeared_mask = appeared_mask[:C] - fed_w = appeared_mask.view(1, C).expand(B, C) - weight = weight * fed_w.float() - # if self.ignore_zero_cats and (self.freq_weight is not None): - # w = (self.freq_weight.view(-1) > 1e-4).float() - # weight = weight * w.view(1, C).expand(B, C) - # # import pdb; pdb.set_trace() - - cls_loss = F.binary_cross_entropy_with_logits( - cls_score[:, :-1], target, reduction='none') # B x C - loss = torch.sum(cls_loss * weight) / B - return loss - - def image_label_losses(self, cls_score, sampling_results, image_labels): - ''' - Inputs: - cls_score: N x (C + 1) - image_labels B x 1 - ''' - num_inst_per_image = [ - len(pred_instances) for pred_instances in sampling_results - ] - cls_score = cls_score.split( - num_inst_per_image, dim=0) # B x n x (C + 1) - B = len(cls_score) - loss = cls_score[0].new_zeros([1])[0] - for (score, labels, pred_instances) in zip(cls_score, image_labels, - sampling_results): - if score.shape[0] == 0: - loss += score.new_zeros([1])[0] - continue - # find out max-size idx - bboxes = pred_instances.bboxes - areas = (bboxes[:, 2] - bboxes[:, 0]) * ( - bboxes[:, 3] - bboxes[:, 1]) - idx = areas[:-1].argmax().item() if len(areas) > 1 else 0 - - for label in labels: - target = score.new_zeros(score.shape[1]) - target[label] = 1 - loss_i = F.binary_cross_entropy_with_logits( - score[idx], target, reduction='sum') - loss += loss_i / len(labels) - loss = loss / B - - return loss * self.image_loss_weight - - def refine_bboxes(self, bbox_results: dict, - batch_img_metas: List[dict]) -> InstanceList: - """Refine bboxes during training. - - Args: - bbox_results (dict): Usually is a dictionary with keys: - - - `cls_score` (Tensor): Classification scores. - - `bbox_pred` (Tensor): Box energies / deltas. - - `rois` (Tensor): RoIs with the shape (n, 5) where the first - column indicates batch id of each RoI. - - `bbox_targets` (tuple): Ground truth for proposals in a - single image. Containing the following list of Tensors: - (labels, label_weights, bbox_targets, bbox_weights) - batch_img_metas (List[dict]): List of image information. - - Returns: - list[:obj:`InstanceData`]: Refined bboxes of each image. - - Example: - >>> # xdoctest: +REQUIRES(module:kwarray) - >>> import numpy as np - >>> from mmdet.models.task_modules.samplers. - ... sampling_result import random_boxes - >>> from mmdet.models.task_modules.samplers import SamplingResult - >>> self = BBoxHead(reg_class_agnostic=True) - >>> n_roi = 2 - >>> n_img = 4 - >>> scale = 512 - >>> rng = np.random.RandomState(0) - ... batch_img_metas = [{'img_shape': (scale, scale)} - >>> for _ in range(n_img)] - >>> sampling_results = [SamplingResult.random(rng=10) - ... 
for _ in range(n_img)] - >>> # Create rois in the expected format - >>> roi_boxes = random_boxes(n_roi, scale=scale, rng=rng) - >>> img_ids = torch.randint(0, n_img, (n_roi,)) - >>> img_ids = img_ids.float() - >>> rois = torch.cat([img_ids[:, None], roi_boxes], dim=1) - >>> # Create other args - >>> labels = torch.randint(0, 81, (scale,)).long() - >>> bbox_preds = random_boxes(n_roi, scale=scale, rng=rng) - >>> cls_score = torch.randn((scale, 81)) - ... # For each image, pretend random positive boxes are gts - >>> bbox_targets = (labels, None, None, None) - ... bbox_results = dict(rois=rois, bbox_pred=bbox_preds, - ... cls_score=cls_score, - ... bbox_targets=bbox_targets) - >>> bboxes_list = self.refine_bboxes(sampling_results, - ... bbox_results, - ... batch_img_metas) - >>> print(bboxes_list) - """ - # bbox_targets is a tuple - cls_scores = bbox_results['cls_score'] - rois = bbox_results['rois'] - bbox_preds = bbox_results['bbox_pred'] - if self.custom_activation: - # TODO: Create a SeasawBBoxHead to simplified logic in BBoxHead - cls_scores = self.loss_cls.get_activation(cls_scores) - if cls_scores.numel() == 0: - return None - if cls_scores.shape[-1] == self.num_classes + 1: - # remove background class - cls_scores = cls_scores[:, :-1] - elif cls_scores.shape[-1] != self.num_classes: - raise ValueError('The last dim of `cls_scores` should equal to ' - '`num_classes` or `num_classes + 1`,' - f'but got {cls_scores.shape[-1]}.') - - img_ids = rois[:, 0].long().unique(sorted=True) - assert img_ids.numel() <= len(batch_img_metas) - - results_list = [] - for i in range(len(batch_img_metas)): - inds = torch.nonzero( - rois[:, 0] == i, as_tuple=False).squeeze(dim=1) - - bboxes_ = rois[inds, 1:] - bbox_pred_ = bbox_preds[inds] - img_meta_ = batch_img_metas[i] - - bboxes = self.regress(bboxes_, bbox_pred_, img_meta_) - - # don't filter gt bboxes like D2 - results = InstanceData(bboxes=bboxes) - results_list.append(results) - - return results_list - - def regress(self, priors: Tensor, bbox_pred: Tensor, - img_meta: dict) -> Tensor: - """Regress the bbox for the predicted class. Used in Cascade R-CNN. - - Args: - priors (Tensor): Priors from `rpn_head` or last stage - `bbox_head`, has shape (num_proposals, 4). - label (Tensor): Only used when `self.reg_class_agnostic` - is False, has shape (num_proposals, ). - bbox_pred (Tensor): Regression prediction of - current stage `bbox_head`. When `self.reg_class_agnostic` - is False, it has shape (n, num_classes * 4), otherwise - it has shape (n, 4). - img_meta (dict): Image meta info. - - Returns: - Tensor: Regressed bboxes, the same shape as input rois. - """ - reg_dim = self.bbox_coder.encode_size - assert bbox_pred.size()[1] == reg_dim - - max_shape = img_meta['img_shape'] - regressed_bboxes = self.bbox_coder.decode( - priors, bbox_pred, max_shape=max_shape) - return regressed_bboxes diff --git a/projects/Detic_new/detic/detic_roi_head.py b/projects/Detic_new/detic/detic_roi_head.py deleted file mode 100644 index 35785cda7..000000000 --- a/projects/Detic_new/detic/detic_roi_head.py +++ /dev/null @@ -1,440 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
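# ---------------------------------------------------------------------------
# `image_label_losses` in the bbox head above carries the weak supervision for
# ImageNet-LVIS images: with no boxes available, the largest-area proposal
# stands in for the whole image and its logits are trained against the
# image-level label with binary cross-entropy.  A minimal sketch of that
# selection, with made-up shapes (1203 LVIS classes assumed):
import torch
import torch.nn.functional as F


def max_size_image_loss(scores, bboxes, image_label):
    """scores: (n, C) logits; bboxes: (n, 4) xyxy boxes; image_label: class id."""
    areas = (bboxes[:, 2] - bboxes[:, 0]) * (bboxes[:, 3] - bboxes[:, 1])
    idx = areas.argmax()                         # pick the largest proposal
    target = scores.new_zeros(scores.shape[1])
    target[image_label] = 1.0
    return F.binary_cross_entropy_with_logits(
        scores[idx], target, reduction='sum')

# usage sketch:
#   boxes = torch.tensor([[0., 0., 10., 10.], [0., 0., 60., 40.]])
#   loss = max_size_image_loss(torch.randn(2, 1203), boxes, image_label=7)
# ---------------------------------------------------------------------------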
-from typing import List, Sequence, Tuple - -import torch -from mmengine.structures import InstanceData -from torch import Tensor - -from mmdet.models.roi_heads import CascadeRoIHead -from mmdet.models.task_modules.samplers import SamplingResult -from mmdet.models.test_time_augs import merge_aug_masks -from mmdet.models.utils import empty_instances, unpack_gt_instances -from mmdet.registry import MODELS -from mmdet.structures import SampleList -from mmdet.structures.bbox import bbox2roi, get_box_tensor -from mmdet.utils import ConfigType, InstanceList, MultiConfig - - -@MODELS.register_module() -class DeticRoIHead(CascadeRoIHead): - - def __init__( - self, - *, - mult_proposal_score: bool = False, - with_image_labels: bool = False, - add_image_box: bool = False, - image_box_size: float = 1.0, - ws_num_props: int = 128, - add_feature_to_prop: bool = False, - mask_weight: float = 1.0, - one_class_per_proposal: bool = False, - **kwargs, - ): - super().__init__(**kwargs) - self.mult_proposal_score = mult_proposal_score - self.with_image_labels = with_image_labels - self.add_image_box = add_image_box - self.image_box_size = image_box_size - self.ws_num_props = ws_num_props - self.add_feature_to_prop = add_feature_to_prop - self.mask_weight = mask_weight - self.one_class_per_proposal = one_class_per_proposal - - def init_mask_head(self, mask_roi_extractor: MultiConfig, - mask_head: MultiConfig) -> None: - """Initialize mask head and mask roi extractor. - - Args: - mask_head (dict): Config of mask in mask head. - mask_roi_extractor (:obj:`ConfigDict`, dict or list): - Config of mask roi extractor. - """ - self.mask_head = MODELS.build(mask_head) - - if mask_roi_extractor is not None: - self.share_roi_extractor = False - self.mask_roi_extractor = MODELS.build(mask_roi_extractor) - else: - self.share_roi_extractor = True - self.mask_roi_extractor = self.bbox_roi_extractor - - def _refine_roi(self, x: Tuple[Tensor], rois: Tensor, - batch_img_metas: List[dict], - num_proposals_per_img: Sequence[int], **kwargs) -> tuple: - """Multi-stage refinement of RoI. - - Args: - x (tuple[Tensor]): List of multi-level img features. - rois (Tensor): shape (n, 5), [batch_ind, x1, y1, x2, y2] - batch_img_metas (list[dict]): List of image information. - num_proposals_per_img (sequence[int]): number of proposals - in each image. - - Returns: - tuple: - - - rois (Tensor): Refined RoI. - - cls_scores (list[Tensor]): Average predicted - cls score per image. - - bbox_preds (list[Tensor]): Bbox branch predictions - for the last stage of per image. - """ - # "ms" in variable names means multi-stage - ms_scores = [] - for stage in range(self.num_stages): - bbox_results = self._bbox_forward( - stage=stage, x=x, rois=rois, **kwargs) - - # split batch bbox prediction back to each image - cls_scores = bbox_results['cls_score'].sigmoid() - bbox_preds = bbox_results['bbox_pred'] - - rois = rois.split(num_proposals_per_img, 0) - cls_scores = cls_scores.split(num_proposals_per_img, 0) - ms_scores.append(cls_scores) - bbox_preds = bbox_preds.split(num_proposals_per_img, 0) - - if stage < self.num_stages - 1: - bbox_head = self.bbox_head[stage] - refine_rois_list = [] - for i in range(len(batch_img_metas)): - if rois[i].shape[0] > 0: - bbox_label = cls_scores[i][:, :-1].argmax(dim=1) - # Refactor `bbox_head.regress_by_class` to only accept - # box tensor without img_idx concatenated. 
- refined_bboxes = bbox_head.regress_by_class( - rois[i][:, 1:], bbox_label, bbox_preds[i], - batch_img_metas[i]) - refined_bboxes = get_box_tensor(refined_bboxes) - refined_rois = torch.cat( - [rois[i][:, [0]], refined_bboxes], dim=1) - refine_rois_list.append(refined_rois) - rois = torch.cat(refine_rois_list) - # ms_scores aligned - # average scores of each image by stages - cls_scores = [ - sum([score[i] for score in ms_scores]) / float(len(ms_scores)) - for i in range(len(batch_img_metas)) - ] # aligned - return rois, cls_scores, bbox_preds - - def predict_bbox(self, - x: Tuple[Tensor], - batch_img_metas: List[dict], - rpn_results_list: InstanceList, - rcnn_test_cfg: ConfigType, - rescale: bool = False, - **kwargs) -> InstanceList: - """Perform forward propagation of the bbox head and predict detection - results on the features of the upstream network. - - Args: - x (tuple[Tensor]): Feature maps of all scale level. - batch_img_metas (list[dict]): List of image information. - rpn_results_list (list[:obj:`InstanceData`]): List of region - proposals. - rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of R-CNN. - rescale (bool): If True, return boxes in original image space. - Defaults to False. - - Returns: - list[:obj:`InstanceData`]: Detection results of each image - after the post process. - Each item usually contains following keys. - - - scores (Tensor): Classification scores, has a shape - (num_instance, ) - - labels (Tensor): Labels of bboxes, has a shape - (num_instances, ). - - bboxes (Tensor): Has a shape (num_instances, 4), - the last dimension 4 arrange as (x1, y1, x2, y2). - """ - proposals = [res.bboxes for res in rpn_results_list] - proposal_scores = [res.scores for res in rpn_results_list] - num_proposals_per_img = tuple(len(p) for p in proposals) - rois = bbox2roi(proposals) - - if rois.shape[0] == 0: - return empty_instances( - batch_img_metas, - rois.device, - task_type='bbox', - box_type=self.bbox_head[-1].predict_box_type, - num_classes=self.bbox_head[-1].num_classes, - score_per_cls=rcnn_test_cfg is None) - # rois aligned - rois, cls_scores, bbox_preds = self._refine_roi( - x=x, - rois=rois, - batch_img_metas=batch_img_metas, - num_proposals_per_img=num_proposals_per_img, - **kwargs) - - # score reweighting in centernet2 - cls_scores = [(s * ps[:, None])**0.5 - for s, ps in zip(cls_scores, proposal_scores)] - # # for demo - # cls_scores = [ - # s * (s == s[:, :-1].max(dim=1)[0][:, None]).float() - # for s in cls_scores - # ] - - # fast_rcnn_inference - results_list = self.bbox_head[-1].predict_by_feat( - rois=rois, - cls_scores=cls_scores, - bbox_preds=bbox_preds, - batch_img_metas=batch_img_metas, - rescale=rescale, - rcnn_test_cfg=rcnn_test_cfg) - return results_list - - def _mask_forward(self, x: Tuple[Tensor], rois: Tensor) -> dict: - """Mask head forward function used in both training and testing. - - Args: - stage (int): The current stage in Cascade RoI Head. - x (tuple[Tensor]): Tuple of multi-level img features. - rois (Tensor): RoIs with the shape (n, 5) where the first - column indicates batch id of each RoI. - - Returns: - dict: Usually returns a dictionary with keys: - - - `mask_preds` (Tensor): Mask prediction. 
- """ - mask_feats = self.mask_roi_extractor( - x[:self.mask_roi_extractor.num_inputs], rois) - # do not support caffe_c4 model anymore - mask_preds = self.mask_head(mask_feats) - - mask_results = dict(mask_preds=mask_preds) - return mask_results - - def mask_loss(self, x, sampling_results: List[SamplingResult], - batch_gt_instances: InstanceList) -> dict: - """Run forward function and calculate loss for mask head in training. - - Args: - x (tuple[Tensor]): Tuple of multi-level img features. - sampling_results (list["obj:`SamplingResult`]): Sampling results. - batch_gt_instances (list[:obj:`InstanceData`]): Batch of - gt_instance. It usually includes ``bboxes``, ``labels``, and - ``masks`` attributes. - - Returns: - dict: Usually returns a dictionary with keys: - - - `mask_preds` (Tensor): Mask prediction. - - `loss_mask` (dict): A dictionary of mask loss components. - """ - pos_rois = bbox2roi([res.pos_priors for res in sampling_results]) - mask_results = self._mask_forward(x, pos_rois) - - mask_loss_and_target = self.mask_head.loss_and_target( - mask_preds=mask_results['mask_preds'], - sampling_results=sampling_results, - batch_gt_instances=batch_gt_instances, - rcnn_train_cfg=self.train_cfg[-1]) - mask_results.update(mask_loss_and_target) - - return mask_results - - def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList, - batch_data_samples: SampleList) -> dict: - """Perform forward propagation and loss calculation of the detection - roi on the features of the upstream network. - - Args: - x (tuple[Tensor]): List of multi-level img features. - rpn_results_list (list[:obj:`InstanceData`]): List of region - proposals. - batch_data_samples (list[:obj:`DetDataSample`]): The batch - data samples. It usually includes information such - as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. 
- - Returns: - dict[str, Tensor]: A dictionary of loss components - """ - assert len(rpn_results_list) == len(batch_data_samples) - outputs = unpack_gt_instances(batch_data_samples) - batch_gt_instances, batch_gt_instances_ignore, batch_img_metas \ - = outputs - - num_imgs = len(batch_data_samples) - image_labels = [x.gt_instances.labels for x in batch_data_samples] - losses = dict() - results_list = rpn_results_list - - for stage in range(self.num_stages): - self.current_stage = stage - stage_loss_weight = self.stage_loss_weights[stage] - if hasattr(batch_gt_instances[0], 'bboxes'): - # assign gts and sample proposals - sampling_results = [] - if self.with_bbox or self.with_mask: - bbox_assigner = self.bbox_assigner[stage] - bbox_sampler = self.bbox_sampler[stage] - - for i in range(num_imgs): - results = results_list[i] - # rename rpn_results.bboxes to rpn_results.priors - results.priors = results.pop('bboxes') - - assign_result = bbox_assigner.assign( - results, batch_gt_instances[i], - batch_gt_instances_ignore[i]) - - sampling_result = bbox_sampler.sample( - assign_result, - results, - batch_gt_instances[i], - feats=[lvl_feat[i][None] for lvl_feat in x]) - - sampling_results.append(sampling_result) - - # bbox head forward and loss - bbox_results = self.bbox_loss(stage, x, sampling_results) - - for name, value in bbox_results['loss_bbox'].items(): - losses[f's{stage}.{name}'] = ( - value * stage_loss_weight if 'loss' in name else value) - losses[f's{stage}.image_loss'] = x[0].new_zeros([1])[0] - - # mask head forward and loss - # D2 only forward stage.0 - if self.with_mask and stage == 0: - mask_results = self.mask_loss(x, sampling_results, - batch_gt_instances) - for name, value in mask_results['loss_mask'].items(): - losses[name] = ( - value * - stage_loss_weight if 'loss' in name else value) - - else: - # get ws_num_props pred_instances for each image - sampling_results = [ - pred_instances[:self.ws_num_props] - for pred_instances in results_list - ] - for i, pred_instances in enumerate(sampling_results): - pred_instances.bboxes = pred_instances.bboxes.detach() - bbox_results = self.image_loss(stage, x, sampling_results, - image_labels) - losses[f's{stage}.image_loss'] = bbox_results['image_loss'] - - for name in ['loss_cls', 'loss_bbox']: - losses[f's{stage}.{name}'] = x[0].new_zeros([1])[0] - if stage == 0: - losses['loss_mask'] = x[0].new_zeros([1])[0] - - # refine bboxes - if stage < self.num_stages - 1: - bbox_head = self.bbox_head[stage] - with torch.no_grad(): - results_list = bbox_head.refine_bboxes( - bbox_results, batch_img_metas) - # Empty proposal - if results_list is None: - break - - return losses - - def image_loss(self, stage: int, x: Tuple[Tensor], - sampling_results: List[SamplingResult], - image_labels) -> dict: - """Run forward function and calculate loss for box head in training. - - Args: - stage (int): The current stage in Cascade RoI Head. - x (tuple[Tensor]): List of multi-level img features. - sampling_results (list["obj:`SamplingResult`]): Sampling results. - - Returns: - dict: Usually returns a dictionary with keys: - - - `cls_score` (Tensor): Classification scores. - - `bbox_pred` (Tensor): Box energies / deltas. - - `bbox_feats` (Tensor): Extract bbox RoI features. - - `loss_bbox` (dict): A dictionary of bbox loss components. - - `rois` (Tensor): RoIs with the shape (n, 5) where the first - column indicates batch id of each RoI. - - `bbox_targets` (tuple): Ground truth for proposals in a - single image. 
Containing the following list of Tensors: - (labels, label_weights, bbox_targets, bbox_weights) - """ - bbox_head = self.bbox_head[stage] - rois = bbox2roi([res.bboxes for res in sampling_results]) - bbox_results = self._bbox_forward(stage, x, rois) - bbox_results.update(rois=rois) - - image_loss = bbox_head.image_label_losses( - cls_score=bbox_results['cls_score'], - sampling_results=sampling_results, - image_labels=image_labels) - bbox_results.update(dict(image_loss=image_loss)) - - return bbox_results - - def predict_mask(self, - x: Tuple[Tensor], - batch_img_metas: List[dict], - results_list: List[InstanceData], - rescale: bool = False) -> List[InstanceData]: - """Perform forward propagation of the mask head and predict detection - results on the features of the upstream network. - - Args: - x (tuple[Tensor]): Feature maps of all scale level. - batch_img_metas (list[dict]): List of image information. - results_list (list[:obj:`InstanceData`]): Detection results of - each image. - rescale (bool): If True, return boxes in original image space. - Defaults to False. - - Returns: - list[:obj:`InstanceData`]: Detection results of each image - after the post process. - Each item usually contains following keys. - - - scores (Tensor): Classification scores, has a shape - (num_instance, ) - - labels (Tensor): Labels of bboxes, has a shape - (num_instances, ). - - bboxes (Tensor): Has a shape (num_instances, 4), - the last dimension 4 arrange as (x1, y1, x2, y2). - - masks (Tensor): Has a shape (num_instances, H, W). - """ - bboxes = [res.bboxes for res in results_list] - mask_rois = bbox2roi(bboxes) - if mask_rois.shape[0] == 0: - results_list = empty_instances( - batch_img_metas, - mask_rois.device, - task_type='mask', - instance_results=results_list, - mask_thr_binary=self.test_cfg.mask_thr_binary) - return results_list - - num_mask_rois_per_img = [len(res) for res in results_list] - aug_masks = [] - mask_results = self._mask_forward(x, mask_rois) - mask_preds = mask_results['mask_preds'] - # split batch mask prediction back to each image - mask_preds = mask_preds.split(num_mask_rois_per_img, 0) - aug_masks.append([m.sigmoid().detach() for m in mask_preds]) - - merged_masks = [] - for i in range(len(batch_img_metas)): - aug_mask = [mask[i] for mask in aug_masks] - merged_mask = merge_aug_masks(aug_mask, batch_img_metas[i]) - merged_masks.append(merged_mask) - results_list = self.mask_head.predict_by_feat( - mask_preds=merged_masks, - results_list=results_list, - batch_img_metas=batch_img_metas, - rcnn_test_cfg=self.test_cfg, - rescale=rescale, - activate_map=True) - return results_list diff --git a/projects/Detic_new/detic/heatmap_focal_loss.py b/projects/Detic_new/detic/heatmap_focal_loss.py deleted file mode 100644 index 021a5b22d..000000000 --- a/projects/Detic_new/detic/heatmap_focal_loss.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
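One detail of `predict_bbox` above worth spelling out is the CenterNet2-style score reweighting `(s * ps[:, None]) ** 0.5`: the final detection score is the geometric mean of the per-class classification score and the proposal (objectness) score, so a box only keeps a high score when both terms agree. A tiny self-contained sketch with made-up numbers:

```python
import torch

cls_score = torch.tensor([[0.81, 0.25]])   # sigmoid class scores for one proposal
proposal_score = torch.tensor([0.49])      # objectness score of the same proposal
# Geometric mean of classification and proposal scores.
final_score = (cls_score * proposal_score[:, None]) ** 0.5
print(final_score)  # tensor([[0.6300, 0.3500]])
```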
-from typing import Optional, Union - -import torch -import torch.nn as nn -from torch import Tensor - -from mmdet.registry import MODELS - - -# support class-agnostic heatmap_focal_loss -def heatmap_focal_loss_with_pos_inds( - pred: Tensor, - targets: Tensor, - pos_inds: Tensor, - alpha: float = 2.0, - beta: float = 4.0, - gamma: float = 4.0, - sigmoid_clamp: float = 1e-4, - ignore_high_fp: float = -1.0, - pos_weight: float = 1.0, - neg_weight: float = 1.0, - avg_factor: Optional[Union[int, float]] = None) -> Tensor: - - pred = torch.clamp( - pred.sigmoid_(), min=sigmoid_clamp, max=1 - sigmoid_clamp) - - neg_weights = torch.pow(1 - targets, beta) - - pos_pred = pred[pos_inds] - pos_loss = torch.log(pos_pred) * torch.pow(1 - pos_pred, gamma) - neg_loss = torch.log(1 - pred) * torch.pow(pred, gamma) * neg_weights - if ignore_high_fp > 0: - not_high_fp = (pred < ignore_high_fp).float() - neg_loss = not_high_fp * neg_loss - - pos_loss = -pos_loss.sum() - neg_loss = -neg_loss.sum() - if alpha >= 0: - pos_loss = alpha * pos_loss - neg_loss = (1 - alpha) * neg_loss - - pos_loss = pos_weight * pos_loss / avg_factor - neg_loss = neg_weight * neg_loss / avg_factor - - return pos_loss, neg_loss - - -@MODELS.register_module() -class HeatmapFocalLoss(nn.Module): - """GaussianFocalLoss is a variant of focal loss. - - More details can be found in the `paper - `_ - Code is modified from `kp_utils.py - `_ # noqa: E501 - Please notice that the target in GaussianFocalLoss is a gaussian heatmap, - not 0/1 binary target. - - Args: - alpha (float): Power of prediction. - gamma (float): Power of target for negative samples. - reduction (str): Options are "none", "mean" and "sum". - loss_weight (float): Loss weight of current loss. - pos_weight(float): Positive sample loss weight. Defaults to 1.0. - neg_weight(float): Negative sample loss weight. Defaults to 1.0. - """ - - def __init__( - self, - alpha: float = 2.0, - beta: float = 4.0, - gamma: float = 4.0, - sigmoid_clamp: float = 1e-4, - ignore_high_fp: float = -1.0, - loss_weight: float = 1.0, - pos_weight: float = 1.0, - neg_weight: float = 1.0, - ) -> None: - super().__init__() - self.alpha = alpha - self.beta = beta - self.gamma = gamma - self.sigmoid_clamp = sigmoid_clamp - self.ignore_high_fp = ignore_high_fp - self.loss_weight = loss_weight - self.pos_weight = pos_weight - self.neg_weight = neg_weight - - def forward(self, - pred: Tensor, - target: Tensor, - pos_inds: Optional[Tensor] = None, - avg_factor: Optional[Union[int, float]] = None) -> Tensor: - """Forward function. - - If you want to manually determine which positions are - positive samples, you can set the pos_index and pos_label - parameter. Currently, only the CenterNet update version uses - the parameter. - - Args: - pred (torch.Tensor): The prediction. The shape is (N, num_classes). - target (torch.Tensor): The learning target of the prediction - in gaussian distribution. The shape is (N, num_classes). - pos_inds (torch.Tensor): The positive sample index. - Defaults to None. - pos_labels (torch.Tensor): The label corresponding to the positive - sample index. Defaults to None. - weight (torch.Tensor, optional): The weight of loss for each - prediction. Defaults to None. - avg_factor (int, float, optional): Average factor that is used to - average the loss. Defaults to None. - reduction_override (str, optional): The reduction method used to - override the original reduction method of the loss. - Defaults to None. 
- """ - - pos_loss, neg_loss = heatmap_focal_loss_with_pos_inds( - pred, - target, - pos_inds, - alpha=self.alpha, - beta=self.beta, - gamma=self.gamma, - sigmoid_clamp=self.sigmoid_clamp, - ignore_high_fp=self.ignore_high_fp, - pos_weight=self.pos_weight, - neg_weight=self.neg_weight, - avg_factor=avg_factor) - return pos_loss, neg_loss diff --git a/projects/Detic_new/detic/imagenet_lvis.py b/projects/Detic_new/detic/imagenet_lvis.py deleted file mode 100644 index 3375a0866..000000000 --- a/projects/Detic_new/detic/imagenet_lvis.py +++ /dev/null @@ -1,395 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved.METAINFO -import copy -import os.path as osp -import pickle -import warnings -from typing import List, Union - -from mmengine.fileio import get_local_path - -from mmdet.datasets import LVISV1Dataset -from mmdet.registry import DATASETS - - -@DATASETS.register_module() -class ImageNetLVISV1Dataset(LVISV1Dataset): - """LVIS v1 dataset for detection.""" - - METAINFO = { - 'classes': - ('aerosol_can', 'air_conditioner', 'airplane', 'alarm_clock', - 'alcohol', 'alligator', 'almond', 'ambulance', 'amplifier', 'anklet', - 'antenna', 'apple', 'applesauce', 'apricot', 'apron', 'aquarium', - 'arctic_(type_of_shoe)', 'armband', 'armchair', 'armoire', 'armor', - 'artichoke', 'trash_can', 'ashtray', 'asparagus', 'atomizer', - 'avocado', 'award', 'awning', 'ax', 'baboon', 'baby_buggy', - 'basketball_backboard', 'backpack', 'handbag', 'suitcase', 'bagel', - 'bagpipe', 'baguet', 'bait', 'ball', 'ballet_skirt', 'balloon', - 'bamboo', 'banana', 'Band_Aid', 'bandage', 'bandanna', 'banjo', - 'banner', 'barbell', 'barge', 'barrel', 'barrette', 'barrow', - 'baseball_base', 'baseball', 'baseball_bat', 'baseball_cap', - 'baseball_glove', 'basket', 'basketball', 'bass_horn', 'bat_(animal)', - 'bath_mat', 'bath_towel', 'bathrobe', 'bathtub', 'batter_(food)', - 'battery', 'beachball', 'bead', 'bean_curd', 'beanbag', 'beanie', - 'bear', 'bed', 'bedpan', 'bedspread', 'cow', 'beef_(food)', 'beeper', - 'beer_bottle', 'beer_can', 'beetle', 'bell', 'bell_pepper', 'belt', - 'belt_buckle', 'bench', 'beret', 'bib', 'Bible', 'bicycle', 'visor', - 'billboard', 'binder', 'binoculars', 'bird', 'birdfeeder', 'birdbath', - 'birdcage', 'birdhouse', 'birthday_cake', 'birthday_card', - 'pirate_flag', 'black_sheep', 'blackberry', 'blackboard', 'blanket', - 'blazer', 'blender', 'blimp', 'blinker', 'blouse', 'blueberry', - 'gameboard', 'boat', 'bob', 'bobbin', 'bobby_pin', 'boiled_egg', - 'bolo_tie', 'deadbolt', 'bolt', 'bonnet', 'book', 'bookcase', - 'booklet', 'bookmark', 'boom_microphone', 'boot', 'bottle', - 'bottle_opener', 'bouquet', 'bow_(weapon)', - 'bow_(decorative_ribbons)', 'bow-tie', 'bowl', 'pipe_bowl', - 'bowler_hat', 'bowling_ball', 'box', 'boxing_glove', 'suspenders', - 'bracelet', 'brass_plaque', 'brassiere', 'bread-bin', 'bread', - 'breechcloth', 'bridal_gown', 'briefcase', 'broccoli', 'broach', - 'broom', 'brownie', 'brussels_sprouts', 'bubble_gum', 'bucket', - 'horse_buggy', 'bull', 'bulldog', 'bulldozer', 'bullet_train', - 'bulletin_board', 'bulletproof_vest', 'bullhorn', 'bun', 'bunk_bed', - 'buoy', 'burrito', 'bus_(vehicle)', 'business_card', 'butter', - 'butterfly', 'button', 'cab_(taxi)', 'cabana', 'cabin_car', 'cabinet', - 'locker', 'cake', 'calculator', 'calendar', 'calf', 'camcorder', - 'camel', 'camera', 'camera_lens', 'camper_(vehicle)', 'can', - 'can_opener', 'candle', 'candle_holder', 'candy_bar', 'candy_cane', - 'walking_cane', 'canister', 'canoe', 'cantaloup', 'canteen', - 'cap_(headwear)', 
'bottle_cap', 'cape', 'cappuccino', - 'car_(automobile)', 'railcar_(part_of_a_train)', 'elevator_car', - 'car_battery', 'identity_card', 'card', 'cardigan', 'cargo_ship', - 'carnation', 'horse_carriage', 'carrot', 'tote_bag', 'cart', 'carton', - 'cash_register', 'casserole', 'cassette', 'cast', 'cat', - 'cauliflower', 'cayenne_(spice)', 'CD_player', 'celery', - 'cellular_telephone', 'chain_mail', 'chair', 'chaise_longue', - 'chalice', 'chandelier', 'chap', 'checkbook', 'checkerboard', - 'cherry', 'chessboard', 'chicken_(animal)', 'chickpea', - 'chili_(vegetable)', 'chime', 'chinaware', 'crisp_(potato_chip)', - 'poker_chip', 'chocolate_bar', 'chocolate_cake', 'chocolate_milk', - 'chocolate_mousse', 'choker', 'chopping_board', 'chopstick', - 'Christmas_tree', 'slide', 'cider', 'cigar_box', 'cigarette', - 'cigarette_case', 'cistern', 'clarinet', 'clasp', 'cleansing_agent', - 'cleat_(for_securing_rope)', 'clementine', 'clip', 'clipboard', - 'clippers_(for_plants)', 'cloak', 'clock', 'clock_tower', - 'clothes_hamper', 'clothespin', 'clutch_bag', 'coaster', 'coat', - 'coat_hanger', 'coatrack', 'cock', 'cockroach', 'cocoa_(beverage)', - 'coconut', 'coffee_maker', 'coffee_table', 'coffeepot', 'coil', - 'coin', 'colander', 'coleslaw', 'coloring_material', - 'combination_lock', 'pacifier', 'comic_book', 'compass', - 'computer_keyboard', 'condiment', 'cone', 'control', - 'convertible_(automobile)', 'sofa_bed', 'cooker', 'cookie', - 'cooking_utensil', 'cooler_(for_food)', 'cork_(bottle_plug)', - 'corkboard', 'corkscrew', 'edible_corn', 'cornbread', 'cornet', - 'cornice', 'cornmeal', 'corset', 'costume', 'cougar', 'coverall', - 'cowbell', 'cowboy_hat', 'crab_(animal)', 'crabmeat', 'cracker', - 'crape', 'crate', 'crayon', 'cream_pitcher', 'crescent_roll', 'crib', - 'crock_pot', 'crossbar', 'crouton', 'crow', 'crowbar', 'crown', - 'crucifix', 'cruise_ship', 'police_cruiser', 'crumb', 'crutch', - 'cub_(animal)', 'cube', 'cucumber', 'cufflink', 'cup', 'trophy_cup', - 'cupboard', 'cupcake', 'hair_curler', 'curling_iron', 'curtain', - 'cushion', 'cylinder', 'cymbal', 'dagger', 'dalmatian', 'dartboard', - 'date_(fruit)', 'deck_chair', 'deer', 'dental_floss', 'desk', - 'detergent', 'diaper', 'diary', 'die', 'dinghy', 'dining_table', - 'tux', 'dish', 'dish_antenna', 'dishrag', 'dishtowel', 'dishwasher', - 'dishwasher_detergent', 'dispenser', 'diving_board', 'Dixie_cup', - 'dog', 'dog_collar', 'doll', 'dollar', 'dollhouse', 'dolphin', - 'domestic_ass', 'doorknob', 'doormat', 'doughnut', 'dove', - 'dragonfly', 'drawer', 'underdrawers', 'dress', 'dress_hat', - 'dress_suit', 'dresser', 'drill', 'drone', 'dropper', - 'drum_(musical_instrument)', 'drumstick', 'duck', 'duckling', - 'duct_tape', 'duffel_bag', 'dumbbell', 'dumpster', 'dustpan', 'eagle', - 'earphone', 'earplug', 'earring', 'easel', 'eclair', 'eel', 'egg', - 'egg_roll', 'egg_yolk', 'eggbeater', 'eggplant', 'electric_chair', - 'refrigerator', 'elephant', 'elk', 'envelope', 'eraser', 'escargot', - 'eyepatch', 'falcon', 'fan', 'faucet', 'fedora', 'ferret', - 'Ferris_wheel', 'ferry', 'fig_(fruit)', 'fighter_jet', 'figurine', - 'file_cabinet', 'file_(tool)', 'fire_alarm', 'fire_engine', - 'fire_extinguisher', 'fire_hose', 'fireplace', 'fireplug', - 'first-aid_kit', 'fish', 'fish_(food)', 'fishbowl', 'fishing_rod', - 'flag', 'flagpole', 'flamingo', 'flannel', 'flap', 'flash', - 'flashlight', 'fleece', 'flip-flop_(sandal)', 'flipper_(footwear)', - 'flower_arrangement', 'flute_glass', 'foal', 'folding_chair', - 'food_processor', 'football_(American)', 
'football_helmet', - 'footstool', 'fork', 'forklift', 'freight_car', 'French_toast', - 'freshener', 'frisbee', 'frog', 'fruit_juice', 'frying_pan', 'fudge', - 'funnel', 'futon', 'gag', 'garbage', 'garbage_truck', 'garden_hose', - 'gargle', 'gargoyle', 'garlic', 'gasmask', 'gazelle', 'gelatin', - 'gemstone', 'generator', 'giant_panda', 'gift_wrap', 'ginger', - 'giraffe', 'cincture', 'glass_(drink_container)', 'globe', 'glove', - 'goat', 'goggles', 'goldfish', 'golf_club', 'golfcart', - 'gondola_(boat)', 'goose', 'gorilla', 'gourd', 'grape', 'grater', - 'gravestone', 'gravy_boat', 'green_bean', 'green_onion', 'griddle', - 'grill', 'grits', 'grizzly', 'grocery_bag', 'guitar', 'gull', 'gun', - 'hairbrush', 'hairnet', 'hairpin', 'halter_top', 'ham', 'hamburger', - 'hammer', 'hammock', 'hamper', 'hamster', 'hair_dryer', 'hand_glass', - 'hand_towel', 'handcart', 'handcuff', 'handkerchief', 'handle', - 'handsaw', 'hardback_book', 'harmonium', 'hat', 'hatbox', 'veil', - 'headband', 'headboard', 'headlight', 'headscarf', 'headset', - 'headstall_(for_horses)', 'heart', 'heater', 'helicopter', 'helmet', - 'heron', 'highchair', 'hinge', 'hippopotamus', 'hockey_stick', 'hog', - 'home_plate_(baseball)', 'honey', 'fume_hood', 'hook', 'hookah', - 'hornet', 'horse', 'hose', 'hot-air_balloon', 'hotplate', 'hot_sauce', - 'hourglass', 'houseboat', 'hummingbird', 'hummus', 'polar_bear', - 'icecream', 'popsicle', 'ice_maker', 'ice_pack', 'ice_skate', - 'igniter', 'inhaler', 'iPod', 'iron_(for_clothing)', 'ironing_board', - 'jacket', 'jam', 'jar', 'jean', 'jeep', 'jelly_bean', 'jersey', - 'jet_plane', 'jewel', 'jewelry', 'joystick', 'jumpsuit', 'kayak', - 'keg', 'kennel', 'kettle', 'key', 'keycard', 'kilt', 'kimono', - 'kitchen_sink', 'kitchen_table', 'kite', 'kitten', 'kiwi_fruit', - 'knee_pad', 'knife', 'knitting_needle', 'knob', 'knocker_(on_a_door)', - 'koala', 'lab_coat', 'ladder', 'ladle', 'ladybug', 'lamb_(animal)', - 'lamb-chop', 'lamp', 'lamppost', 'lampshade', 'lantern', 'lanyard', - 'laptop_computer', 'lasagna', 'latch', 'lawn_mower', 'leather', - 'legging_(clothing)', 'Lego', 'legume', 'lemon', 'lemonade', - 'lettuce', 'license_plate', 'life_buoy', 'life_jacket', 'lightbulb', - 'lightning_rod', 'lime', 'limousine', 'lion', 'lip_balm', 'liquor', - 'lizard', 'log', 'lollipop', 'speaker_(stereo_equipment)', 'loveseat', - 'machine_gun', 'magazine', 'magnet', 'mail_slot', 'mailbox_(at_home)', - 'mallard', 'mallet', 'mammoth', 'manatee', 'mandarin_orange', - 'manger', 'manhole', 'map', 'marker', 'martini', 'mascot', - 'mashed_potato', 'masher', 'mask', 'mast', 'mat_(gym_equipment)', - 'matchbox', 'mattress', 'measuring_cup', 'measuring_stick', - 'meatball', 'medicine', 'melon', 'microphone', 'microscope', - 'microwave_oven', 'milestone', 'milk', 'milk_can', 'milkshake', - 'minivan', 'mint_candy', 'mirror', 'mitten', 'mixer_(kitchen_tool)', - 'money', 'monitor_(computer_equipment) computer_monitor', 'monkey', - 'motor', 'motor_scooter', 'motor_vehicle', 'motorcycle', - 'mound_(baseball)', 'mouse_(computer_equipment)', 'mousepad', - 'muffin', 'mug', 'mushroom', 'music_stool', 'musical_instrument', - 'nailfile', 'napkin', 'neckerchief', 'necklace', 'necktie', 'needle', - 'nest', 'newspaper', 'newsstand', 'nightshirt', - 'nosebag_(for_animals)', 'noseband_(for_animals)', 'notebook', - 'notepad', 'nut', 'nutcracker', 'oar', 'octopus_(food)', - 'octopus_(animal)', 'oil_lamp', 'olive_oil', 'omelet', 'onion', - 'orange_(fruit)', 'orange_juice', 'ostrich', 'ottoman', 'oven', - 'overalls_(clothing)', 'owl', 'packet', 
'inkpad', 'pad', 'paddle', - 'padlock', 'paintbrush', 'painting', 'pajamas', 'palette', - 'pan_(for_cooking)', 'pan_(metal_container)', 'pancake', 'pantyhose', - 'papaya', 'paper_plate', 'paper_towel', 'paperback_book', - 'paperweight', 'parachute', 'parakeet', 'parasail_(sports)', - 'parasol', 'parchment', 'parka', 'parking_meter', 'parrot', - 'passenger_car_(part_of_a_train)', 'passenger_ship', 'passport', - 'pastry', 'patty_(food)', 'pea_(food)', 'peach', 'peanut_butter', - 'pear', 'peeler_(tool_for_fruit_and_vegetables)', 'wooden_leg', - 'pegboard', 'pelican', 'pen', 'pencil', 'pencil_box', - 'pencil_sharpener', 'pendulum', 'penguin', 'pennant', 'penny_(coin)', - 'pepper', 'pepper_mill', 'perfume', 'persimmon', 'person', 'pet', - 'pew_(church_bench)', 'phonebook', 'phonograph_record', 'piano', - 'pickle', 'pickup_truck', 'pie', 'pigeon', 'piggy_bank', 'pillow', - 'pin_(non_jewelry)', 'pineapple', 'pinecone', 'ping-pong_ball', - 'pinwheel', 'tobacco_pipe', 'pipe', 'pistol', 'pita_(bread)', - 'pitcher_(vessel_for_liquid)', 'pitchfork', 'pizza', 'place_mat', - 'plate', 'platter', 'playpen', 'pliers', 'plow_(farm_equipment)', - 'plume', 'pocket_watch', 'pocketknife', 'poker_(fire_stirring_tool)', - 'pole', 'polo_shirt', 'poncho', 'pony', 'pool_table', 'pop_(soda)', - 'postbox_(public)', 'postcard', 'poster', 'pot', 'flowerpot', - 'potato', 'potholder', 'pottery', 'pouch', 'power_shovel', 'prawn', - 'pretzel', 'printer', 'projectile_(weapon)', 'projector', 'propeller', - 'prune', 'pudding', 'puffer_(fish)', 'puffin', 'pug-dog', 'pumpkin', - 'puncher', 'puppet', 'puppy', 'quesadilla', 'quiche', 'quilt', - 'rabbit', 'race_car', 'racket', 'radar', 'radiator', 'radio_receiver', - 'radish', 'raft', 'rag_doll', 'raincoat', 'ram_(animal)', 'raspberry', - 'rat', 'razorblade', 'reamer_(juicer)', 'rearview_mirror', 'receipt', - 'recliner', 'record_player', 'reflector', 'remote_control', - 'rhinoceros', 'rib_(food)', 'rifle', 'ring', 'river_boat', 'road_map', - 'robe', 'rocking_chair', 'rodent', 'roller_skate', 'Rollerblade', - 'rolling_pin', 'root_beer', 'router_(computer_equipment)', - 'rubber_band', 'runner_(carpet)', 'plastic_bag', - 'saddle_(on_an_animal)', 'saddle_blanket', 'saddlebag', 'safety_pin', - 'sail', 'salad', 'salad_plate', 'salami', 'salmon_(fish)', - 'salmon_(food)', 'salsa', 'saltshaker', 'sandal_(type_of_shoe)', - 'sandwich', 'satchel', 'saucepan', 'saucer', 'sausage', 'sawhorse', - 'saxophone', 'scale_(measuring_instrument)', 'scarecrow', 'scarf', - 'school_bus', 'scissors', 'scoreboard', 'scraper', 'screwdriver', - 'scrubbing_brush', 'sculpture', 'seabird', 'seahorse', 'seaplane', - 'seashell', 'sewing_machine', 'shaker', 'shampoo', 'shark', - 'sharpener', 'Sharpie', 'shaver_(electric)', 'shaving_cream', 'shawl', - 'shears', 'sheep', 'shepherd_dog', 'sherbert', 'shield', 'shirt', - 'shoe', 'shopping_bag', 'shopping_cart', 'short_pants', 'shot_glass', - 'shoulder_bag', 'shovel', 'shower_head', 'shower_cap', - 'shower_curtain', 'shredder_(for_paper)', 'signboard', 'silo', 'sink', - 'skateboard', 'skewer', 'ski', 'ski_boot', 'ski_parka', 'ski_pole', - 'skirt', 'skullcap', 'sled', 'sleeping_bag', 'sling_(bandage)', - 'slipper_(footwear)', 'smoothie', 'snake', 'snowboard', 'snowman', - 'snowmobile', 'soap', 'soccer_ball', 'sock', 'sofa', 'softball', - 'solar_array', 'sombrero', 'soup', 'soup_bowl', 'soupspoon', - 'sour_cream', 'soya_milk', 'space_shuttle', 'sparkler_(fireworks)', - 'spatula', 'spear', 'spectacles', 'spice_rack', 'spider', 'crawfish', - 'sponge', 'spoon', 'sportswear', 
'spotlight', 'squid_(food)', - 'squirrel', 'stagecoach', 'stapler_(stapling_machine)', 'starfish', - 'statue_(sculpture)', 'steak_(food)', 'steak_knife', 'steering_wheel', - 'stepladder', 'step_stool', 'stereo_(sound_system)', 'stew', - 'stirrer', 'stirrup', 'stool', 'stop_sign', 'brake_light', 'stove', - 'strainer', 'strap', 'straw_(for_drinking)', 'strawberry', - 'street_sign', 'streetlight', 'string_cheese', 'stylus', 'subwoofer', - 'sugar_bowl', 'sugarcane_(plant)', 'suit_(clothing)', 'sunflower', - 'sunglasses', 'sunhat', 'surfboard', 'sushi', 'mop', 'sweat_pants', - 'sweatband', 'sweater', 'sweatshirt', 'sweet_potato', 'swimsuit', - 'sword', 'syringe', 'Tabasco_sauce', 'table-tennis_table', 'table', - 'table_lamp', 'tablecloth', 'tachometer', 'taco', 'tag', 'taillight', - 'tambourine', 'army_tank', 'tank_(storage_vessel)', - 'tank_top_(clothing)', 'tape_(sticky_cloth_or_paper)', 'tape_measure', - 'tapestry', 'tarp', 'tartan', 'tassel', 'tea_bag', 'teacup', - 'teakettle', 'teapot', 'teddy_bear', 'telephone', 'telephone_booth', - 'telephone_pole', 'telephoto_lens', 'television_camera', - 'television_set', 'tennis_ball', 'tennis_racket', 'tequila', - 'thermometer', 'thermos_bottle', 'thermostat', 'thimble', 'thread', - 'thumbtack', 'tiara', 'tiger', 'tights_(clothing)', 'timer', - 'tinfoil', 'tinsel', 'tissue_paper', 'toast_(food)', 'toaster', - 'toaster_oven', 'toilet', 'toilet_tissue', 'tomato', 'tongs', - 'toolbox', 'toothbrush', 'toothpaste', 'toothpick', 'cover', - 'tortilla', 'tow_truck', 'towel', 'towel_rack', 'toy', - 'tractor_(farm_equipment)', 'traffic_light', 'dirt_bike', - 'trailer_truck', 'train_(railroad_vehicle)', 'trampoline', 'tray', - 'trench_coat', 'triangle_(musical_instrument)', 'tricycle', 'tripod', - 'trousers', 'truck', 'truffle_(chocolate)', 'trunk', 'vat', 'turban', - 'turkey_(food)', 'turnip', 'turtle', 'turtleneck_(clothing)', - 'typewriter', 'umbrella', 'underwear', 'unicycle', 'urinal', 'urn', - 'vacuum_cleaner', 'vase', 'vending_machine', 'vent', 'vest', - 'videotape', 'vinegar', 'violin', 'vodka', 'volleyball', 'vulture', - 'waffle', 'waffle_iron', 'wagon', 'wagon_wheel', 'walking_stick', - 'wall_clock', 'wall_socket', 'wallet', 'walrus', 'wardrobe', - 'washbasin', 'automatic_washer', 'watch', 'water_bottle', - 'water_cooler', 'water_faucet', 'water_heater', 'water_jug', - 'water_gun', 'water_scooter', 'water_ski', 'water_tower', - 'watering_can', 'watermelon', 'weathervane', 'webcam', 'wedding_cake', - 'wedding_ring', 'wet_suit', 'wheel', 'wheelchair', 'whipped_cream', - 'whistle', 'wig', 'wind_chime', 'windmill', 'window_box_(for_plants)', - 'windshield_wiper', 'windsock', 'wine_bottle', 'wine_bucket', - 'wineglass', 'blinder_(for_horses)', 'wok', 'wolf', 'wooden_spoon', - 'wreath', 'wrench', 'wristband', 'wristlet', 'yacht', 'yogurt', - 'yoke_(animal_equipment)', 'zebra', 'zucchini'), - 'palette': - None - } - - def get_data_info(self, idx: int) -> dict: - """Get annotation by index and automatically call ``full_init`` if the - dataset has not been fully initialized. - - Args: - idx (int): The index of data. - - Returns: - dict: The idx-th annotation of the dataset. - """ - if self.serialize_data: - start_addr = 0 if idx == 0 else self.data_address[idx - 1].item() - end_addr = self.data_address[idx].item() - bytes = memoryview( - self.data_bytes[start_addr:end_addr]) # type: ignore - data_info = pickle.loads(bytes) # type: ignore - else: - data_info = copy.deepcopy(self.data_list[idx]) - - # Some codebase needs `sample_idx` of data information. 
Here we convert - # the idx to a positive number and save it in data information. - if idx >= 0: - data_info['sample_idx'] = idx - else: - data_info['sample_idx'] = len(self) + idx - - return data_info - - def load_data_list(self) -> List[dict]: - """Load annotations from an annotation file named as ``self.ann_file`` - - Returns: - List[dict]: A list of annotation. - """ # noqa: E501 - try: - import lvis - if getattr(lvis, '__version__', '0') >= '10.5.3': - warnings.warn( - 'mmlvis is deprecated, please install official lvis-api by "pip install git+https://github.com/lvis-dataset/lvis-api.git"', # noqa: E501 - UserWarning) - from lvis import LVIS - except ImportError: - raise ImportError( - 'Package lvis is not installed. Please run "pip install git+https://github.com/lvis-dataset/lvis-api.git".' # noqa: E501 - ) - with get_local_path( - self.ann_file, backend_args=self.backend_args) as local_path: - self.lvis = LVIS(local_path) - self.cat_ids = self.lvis.get_cat_ids() - self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} - self.cat_img_map = copy.deepcopy(self.lvis.cat_img_map) - img_ids = self.lvis.get_img_ids() - data_list = [] - total_ann_ids = [] - for img_id in img_ids: - raw_img_info = self.lvis.load_imgs([img_id])[0] - raw_img_info['img_id'] = img_id - - ann_ids = self.lvis.get_ann_ids(img_ids=[img_id]) - total_ann_ids.extend(ann_ids) - parsed_data_info = self.parse_data_info( - {'raw_img_info': raw_img_info}) - data_list.append(parsed_data_info) - if self.ANN_ID_UNIQUE: - assert len(set(total_ann_ids)) == len( - total_ann_ids - ), f"Annotation ids in '{self.ann_file}' are not unique!" - - del self.lvis - # print(data_list) - return data_list - - def parse_data_info(self, raw_data_info: dict) -> Union[dict, List[dict]]: - """Parse raw annotation to target format. - - Args: - raw_data_info (dict): Raw data information load from ``ann_file`` - - Returns: - Union[dict, List[dict]]: Parsed annotation. - """ - img_info = raw_data_info['raw_img_info'] - - data_info = {} - - # TODO: need to change data_prefix['img'] to data_prefix['img_path'] - img_path = osp.join(self.data_prefix['img'], img_info['file_name']) - if self.data_prefix.get('seg', None): - seg_map_path = osp.join( - self.data_prefix['seg'], - img_info['file_name'].rsplit('.', 1)[0] + self.seg_map_suffix) - else: - seg_map_path = None - data_info['img_path'] = img_path - data_info['img_id'] = img_info['img_id'] - data_info['seg_map_path'] = seg_map_path - data_info['height'] = img_info['height'] - data_info['width'] = img_info['width'] - - if self.return_classes: - data_info['text'] = self.metainfo['classes'] - data_info['custom_entities'] = True - - instances = [] - image_labels = [ - self.cat2label[x] for x in img_info['pos_category_ids'] - ] - for image_label in image_labels: - instance = {} - instance['bbox_label'] = image_label - instances.append(instance) - data_info['instances'] = instances - - return data_info - - def get_cat_ids(self, idx: int) -> List[int]: - """Get COCO category ids by index. - - Args: - idx (int): Index of data. - - Returns: - List[int]: All categories in the image of specified index. 
- """ - data_info = self.get_data_info(idx) - image_labels = [] - for instance in data_info['instances']: - image_labels.append(instance['bbox_label']) - - return image_labels diff --git a/projects/Detic_new/detic/iou_loss.py b/projects/Detic_new/detic/iou_loss.py deleted file mode 100644 index 349545cf5..000000000 --- a/projects/Detic_new/detic/iou_loss.py +++ /dev/null @@ -1,125 +0,0 @@ -import torch -from torch import nn - - -# support calculate IOULoss with box_pred -class IOULoss(nn.Module): - - def __init__(self, loc_loss_type='iou'): - super(IOULoss, self).__init__() - self.loc_loss_type = loc_loss_type - - def forward(self, pred, target, weight=None, reduction='sum'): - pred_left = pred[:, 0] - pred_top = pred[:, 1] - pred_right = pred[:, 2] - pred_bottom = pred[:, 3] - - target_left = target[:, 0] - target_top = target[:, 1] - target_right = target[:, 2] - target_bottom = target[:, 3] - - target_aera = (target_left + target_right) * ( - target_top + target_bottom) - pred_aera = (pred_left + pred_right) * (pred_top + pred_bottom) - - w_intersect = torch.min(pred_left, target_left) + torch.min( - pred_right, target_right) - h_intersect = torch.min(pred_bottom, target_bottom) + torch.min( - pred_top, target_top) - - g_w_intersect = torch.max(pred_left, target_left) + torch.max( - pred_right, target_right) - g_h_intersect = torch.max(pred_bottom, target_bottom) + torch.max( - pred_top, target_top) - ac_uion = g_w_intersect * g_h_intersect - - area_intersect = w_intersect * h_intersect - area_union = target_aera + pred_aera - area_intersect - - ious = (area_intersect + 1.0) / (area_union + 1.0) - gious = ious - (ac_uion - area_union) / ac_uion - if self.loc_loss_type == 'iou': - losses = -torch.log(ious) - elif self.loc_loss_type == 'linear_iou': - losses = 1 - ious - elif self.loc_loss_type == 'giou': - losses = 1 - gious - else: - raise NotImplementedError - - if weight is not None: - losses = losses * weight - else: - losses = losses - - if reduction == 'sum': - return losses.sum() - elif reduction == 'batch': - return losses.sum(dim=[1]) - elif reduction == 'none': - return losses - else: - raise NotImplementedError - - -def giou_loss( - boxes1: torch.Tensor, - boxes2: torch.Tensor, - reduction: str = 'none', - eps: float = 1e-7, -) -> torch.Tensor: - """Generalized Intersection over Union Loss (Hamid Rezatofighi et. - - al) - https://arxiv.org/abs/1902.09630 - Gradient-friendly IoU loss with an additional penalty that is - non-zero when the boxes do not overlap and scales with the size - of their smallest enclosing box. This loss is symmetric, so the - boxes1 and boxes2 arguments are interchangeable. - Args: - boxes1, boxes2 (Tensor): box locations in XYXY format, shape - (N, 4) or (4,). - reduction: 'none' | 'mean' | 'sum' - 'none': No reduction will be applied to the output. - 'mean': The output will be averaged. - 'sum': The output will be summed. 
- eps (float): small number to prevent division by zero - """ - - x1, y1, x2, y2 = boxes1.unbind(dim=-1) - x1g, y1g, x2g, y2g = boxes2.unbind(dim=-1) - - assert (x2 >= x1).all(), 'bad box: x1 larger than x2' - assert (y2 >= y1).all(), 'bad box: y1 larger than y2' - - # Intersection keypoints - xkis1 = torch.max(x1, x1g) - ykis1 = torch.max(y1, y1g) - xkis2 = torch.min(x2, x2g) - ykis2 = torch.min(y2, y2g) - - intsctk = torch.zeros_like(x1) - mask = (ykis2 > ykis1) & (xkis2 > xkis1) - intsctk[mask] = (xkis2[mask] - xkis1[mask]) * (ykis2[mask] - ykis1[mask]) - unionk = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - intsctk - iouk = intsctk / (unionk + eps) - - # smallest enclosing box - xc1 = torch.min(x1, x1g) - yc1 = torch.min(y1, y1g) - xc2 = torch.max(x2, x2g) - yc2 = torch.max(y2, y2g) - - area_c = (xc2 - xc1) * (yc2 - yc1) - miouk = iouk - ((area_c - unionk) / (area_c + eps)) - - loss = 1 - miouk - - if reduction == 'mean': - loss = loss.mean() - elif reduction == 'sum': - loss = loss.sum() - - return loss diff --git a/projects/Detic_new/detic/zero_shot_classifier.py b/projects/Detic_new/detic/zero_shot_classifier.py deleted file mode 100644 index cb9946d58..000000000 --- a/projects/Detic_new/detic/zero_shot_classifier.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -import numpy as np -import torch -from torch import nn -from torch.nn import functional as F - -from mmdet.registry import MODELS - - -@MODELS.register_module() -class ZeroShotClassifier(nn.Module): - - def __init__( - self, - in_features: int, - out_features: int, # num_classes - zs_weight_path: str, - zs_weight_dim: int = 512, - use_bias: float = 0.0, - norm_weight: bool = True, - norm_temperature: float = 50.0, - ): - super().__init__() - num_classes = out_features - self.norm_weight = norm_weight - self.norm_temperature = norm_temperature - - self.use_bias = use_bias < 0 - if self.use_bias: - self.cls_bias = nn.Parameter(torch.ones(1) * use_bias) - - self.linear = nn.Linear(in_features, zs_weight_dim) - - if zs_weight_path == 'rand': - zs_weight = torch.randn((zs_weight_dim, num_classes)) - nn.init.normal_(zs_weight, std=0.01) - else: - zs_weight = torch.tensor( - np.load(zs_weight_path), - dtype=torch.float32).permute(1, 0).contiguous() # D x C - zs_weight = torch.cat( - [zs_weight, zs_weight.new_zeros( - (zs_weight_dim, 1))], dim=1) # D x (C + 1) - - if self.norm_weight: - zs_weight = F.normalize(zs_weight, p=2, dim=0) - - if zs_weight_path == 'rand': - self.zs_weight = nn.Parameter(zs_weight) - else: - self.register_buffer('zs_weight', zs_weight) - - assert self.zs_weight.shape[1] == num_classes + 1, self.zs_weight.shape - - def forward(self, x, classifier=None): - ''' - Inputs: - x: B x D' - classifier_info: (C', C' x D) - ''' - x = self.linear(x) - if classifier is not None: - zs_weight = classifier.permute(1, 0).contiguous() # D x C' - zs_weight = F.normalize(zs_weight, p=2, dim=0) \ - if self.norm_weight else zs_weight - else: - zs_weight = self.zs_weight - if self.norm_weight: - x = self.norm_temperature * F.normalize(x, p=2, dim=1) - x = torch.mm(x, zs_weight) - if self.use_bias: - x = x + self.cls_bias - return x diff --git a/projects/DiffusionDet/README.md b/projects/DiffusionDet/README.md deleted file mode 100644 index 5542d9a59..000000000 --- a/projects/DiffusionDet/README.md +++ /dev/null @@ -1,172 +0,0 @@ -## Description - -This is an implementation of [DiffusionDet](https://github.com/ShoufaChen/DiffusionDet) based on 
[MMDetection](https://github.com/open-mmlab/mmdetection/tree/main), [MMCV](https://github.com/open-mmlab/mmcv), and [MMEngine](https://github.com/open-mmlab/mmengine). - -
- -
- -## Usage - - ### Comparison of results - -1. Download the [DiffusionDet released model](https://github.com/ShoufaChen/DiffusionDet#models). - -2. Convert the model from the DiffusionDet version to the MMDetection version. We provide a [sample script](model_converters/diffusiondet_resnet_to_mmdet.py) - to convert the `DiffusionDet-resnet50` model. Users can download the corresponding models from [here](https://github.com/ShoufaChen/DiffusionDet/releases/download/v0.1/diffdet_coco_res50.pth). - - ```shell - python projects/DiffusionDet/model_converters/diffusiondet_resnet_to_mmdet.py ${DiffusionDet ckpt path} ${MMDetection ckpt path} - ``` - -3. Test the model in MMDetection. - - ```shell - python tools/test.py projects/DiffusionDet/configs/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco.py ${CHECKPOINT_PATH} - ``` - -**Note:** At inference time, DiffusionDet randomly generates the initial noisy boxes, -which may slightly affect the AP results. If users want the same result on every inference run, fixing the random seed is recommended. -The table below compares the inference results on `ResNet50-500-proposals` between DiffusionDet and MMDetection. - -| Config | Step | AP | -| :---: | :--: | :-------: | -| [DiffusionDet](https://github.com/ShoufaChen/DiffusionDet/blob/main/configs/diffdet.coco.res50.yaml) (released results) | 1 | 45.5 | -| [DiffusionDet](https://github.com/ShoufaChen/DiffusionDet/blob/main/configs/diffdet.coco.res50.yaml) (seed=0) | 1 | 45.66 | -| [MMDetection](configs/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco.py) (seed=0) | 1 | 45.7 | -| [MMDetection](configs/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco.py) (random seed) | 1 | 45.6~45.8 | -| [DiffusionDet](https://github.com/ShoufaChen/DiffusionDet/blob/main/configs/diffdet.coco.res50.yaml) (released results) | 4 | 46.1 | -| [DiffusionDet](https://github.com/ShoufaChen/DiffusionDet/blob/main/configs/diffdet.coco.res50.yaml) (seed=0) | 4 | 46.38 | -| [MMDetection](configs/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco.py) (seed=0) | 4 | 46.4 | -| [MMDetection](configs/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco.py) (random seed) | 4 | 46.2~46.4 | - -- `seed=0` means the seed is hard-set to 0 before the random boxes are generated: - ```python - # hard set seed=0 before generating random boxes - seed = 0 - random.seed(seed) - torch.manual_seed(seed) - # torch.cuda.manual_seed(seed) - torch.cuda.manual_seed_all(seed) - ... - noise_bboxes_raw = torch.randn( - (self.num_proposals, 4), - device=device) - ... - ``` -- `random seed` means no seed is hard-set before the random boxes are generated. 
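For quick local reproducibility checks, the seeds can also be fixed outside the model code before running inference through the Python API. A minimal sketch, assuming the high-level `mmdet.apis` interface; the checkpoint path is a placeholder, not a released file:

```python
import random
import torch
from mmdet.apis import init_detector, inference_detector

# Fix the relevant RNGs so the initial noisy boxes are identical across runs.
seed = 0
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

config = 'projects/DiffusionDet/configs/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco.py'
checkpoint = 'path/to/converted_diffusiondet_r50.pth'  # placeholder path
model = init_detector(config, checkpoint, device='cuda:0')
result = inference_detector(model, 'demo/demo.jpg')
```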
- -### Training commands - -In MMDetection's root directory, run the following command to train the model: - -```bash -python tools/train.py projects/DiffusionDet/configs/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco.py -``` - -For multi-gpu training, run: - -```bash -python -m torch.distributed.launch --nnodes=1 --node_rank=0 --nproc_per_node=${NUM_GPUS} --master_port=29506 --master_addr="127.0.0.1" tools/train.py projects/DiffusionDet/configs/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco.py -``` - -### Testing commands - -In MMDetection's root directory, run the following command to test the model: - -```bash -# for 1 step inference -# test command -python tools/test.py projects/DiffusionDet/configs/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco.py ${CHECKPOINT_PATH} - -# for 4 steps inference - -# test command -python tools/test.py projects/DiffusionDet/configs/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco.py ${CHECKPOINT_PATH} --cfg-options model.bbox_head.sampling_timesteps=4 -``` - -**Note:** There is no difference between 1 step or 4 steps (or other multi-step) during training. Users can set different steps during inference through `--cfg-options model.bbox_head.sampling_timesteps=${STEPS}`, but larger `sampling_timesteps` will affect the inference time. - -## Results - -Here we provide the baseline version of DiffusionDet with ResNet50 backbone. - -To find more variants, please visit the [official model zoo](https://github.com/ShoufaChen/DiffusionDet#models). - -| Backbone | Style | Lr schd | AP (Step=1) | AP (Step=4) | Config | Download | -| :------: | :-----: | :-----: | :---------: | :---------: | :----------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| R-50 | PyTorch | 450k | 44.5 | 46.2 | [config](./configs/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/diffusiondet/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco_20230215_090925-7d6ed504.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/diffusiondet/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco_20230215_090925.log.json) | - -## License - -DiffusionDet is under the [CC-BY-NC 4.0 license](https://github.com/ShoufaChen/DiffusionDet/blob/main/LICENSE). Users should be careful about adopting these features in any commercial matters. - -## Citation - -If you find DiffusionDet is useful in your research or applications, please consider giving a star 🌟 to the [official repository](https://github.com/ShoufaChen/DiffusionDet) and citing DiffusionDet by the following BibTeX entry. 
- -```BibTeX -@article{chen2022diffusiondet, - title={DiffusionDet: Diffusion Model for Object Detection}, - author={Chen, Shoufa and Sun, Peize and Song, Yibing and Luo, Ping}, - journal={arXiv preprint arXiv:2211.09788}, - year={2022} -} -``` - -## Checklist - - - -- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. - - - [x] Finish the code - - - - - [x] Basic docstrings & proper citation - - - - - [x] Test-time correctness - - - - - [x] A full README - - - -- [x] Milestone 2: Indicates a successful model implementation. - - - [x] Training-time correctness - - - -- [ ] Milestone 3: Good to be a part of our core package! - - - [ ] Type hints and docstrings - - - - - [ ] Unit tests - - - - - [ ] Code polishing - - - - - [ ] Metafile.yml - - - -- [ ] Move your modules into the core package following the codebase's file hierarchy structure. - - - -- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure. diff --git a/projects/DiffusionDet/configs/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco.py b/projects/DiffusionDet/configs/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco.py deleted file mode 100644 index 187cdc397..000000000 --- a/projects/DiffusionDet/configs/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco.py +++ /dev/null @@ -1,185 +0,0 @@ -_base_ = [ - 'mmdet::_base_/datasets/coco_detection.py', - 'mmdet::_base_/schedules/schedule_1x.py', - 'mmdet::_base_/default_runtime.py' -] - -custom_imports = dict( - imports=['projects.DiffusionDet.diffusiondet'], allow_failed_imports=False) - -# model settings -model = dict( - type='DiffusionDet', - data_preprocessor=dict( - type='DetDataPreprocessor', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - bgr_to_rgb=True, - pad_size_divisor=32), - backbone=dict( - type='ResNet', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - frozen_stages=1, - norm_cfg=dict(type='BN', requires_grad=True), - norm_eval=True, - style='pytorch', - init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), - neck=dict( - type='FPN', - in_channels=[256, 512, 1024, 2048], - out_channels=256, - num_outs=4), - bbox_head=dict( - type='DynamicDiffusionDetHead', - num_classes=80, - feat_channels=256, - num_proposals=500, - num_heads=6, - deep_supervision=True, - prior_prob=0.01, - snr_scale=2.0, - sampling_timesteps=1, - ddim_sampling_eta=1.0, - single_head=dict( - type='SingleDiffusionDetHead', - num_cls_convs=1, - num_reg_convs=3, - dim_feedforward=2048, - num_heads=8, - dropout=0.0, - act_cfg=dict(type='ReLU', inplace=True), - dynamic_conv=dict(dynamic_dim=64, dynamic_num=2)), - roi_extractor=dict( - type='SingleRoIExtractor', - roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2), - out_channels=256, - featmap_strides=[4, 8, 16, 32]), - # criterion - criterion=dict( - type='DiffusionDetCriterion', - num_classes=80, - assigner=dict( - type='DiffusionDetMatcher', - match_costs=[ - dict( - type='FocalLossCost', - alpha=0.25, - gamma=2.0, - weight=2.0, - eps=1e-8), - dict(type='BBoxL1Cost', weight=5.0, box_format='xyxy'), - dict(type='IoUCost', iou_mode='giou', weight=2.0) - ], - center_radius=2.5, - candidate_topk=5), - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - alpha=0.25, - gamma=2.0, - reduction='sum', - loss_weight=2.0), - loss_bbox=dict(type='L1Loss', reduction='sum', loss_weight=5.0), - loss_giou=dict(type='GIoULoss', reduction='sum', - loss_weight=2.0))), - test_cfg=dict( - 
use_nms=True, - score_thr=0.5, - min_bbox_size=0, - nms=dict(type='nms', iou_threshold=0.5), - )) - -backend = 'pillow' -train_pipeline = [ - dict( - type='LoadImageFromFile', - backend_args=_base_.backend_args, - imdecode_backend=backend), - dict(type='LoadAnnotations', with_bbox=True), - dict(type='RandomFlip', prob=0.5), - dict( - type='RandomChoice', - transforms=[[ - dict( - type='RandomChoiceResize', - scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), - (608, 1333), (640, 1333), (672, 1333), (704, 1333), - (736, 1333), (768, 1333), (800, 1333)], - keep_ratio=True, - backend=backend), - ], - [ - dict( - type='RandomChoiceResize', - scales=[(400, 1333), (500, 1333), (600, 1333)], - keep_ratio=True, - backend=backend), - dict( - type='RandomCrop', - crop_type='absolute_range', - crop_size=(384, 600), - allow_negative_crop=True), - dict( - type='RandomChoiceResize', - scales=[(480, 1333), (512, 1333), (544, 1333), - (576, 1333), (608, 1333), (640, 1333), - (672, 1333), (704, 1333), (736, 1333), - (768, 1333), (800, 1333)], - keep_ratio=True, - backend=backend) - ]]), - dict(type='PackDetInputs') -] - -test_pipeline = [ - dict( - type='LoadImageFromFile', - backend_args=_base_.backend_args, - imdecode_backend=backend), - dict(type='Resize', scale=(1333, 800), keep_ratio=True, backend=backend), - # If you don't have a gt annotation, delete the pipeline - dict(type='LoadAnnotations', with_bbox=True), - dict( - type='PackDetInputs', - meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', - 'scale_factor')) -] -train_dataloader = dict( - sampler=dict(type='InfiniteSampler'), - dataset=dict( - filter_cfg=dict(filter_empty_gt=False, min_size=1e-5), - pipeline=train_pipeline)) - -val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) -test_dataloader = val_dataloader - -# optimizer -optim_wrapper = dict( - type='OptimWrapper', - optimizer=dict( - _delete_=True, type='AdamW', lr=0.000025, weight_decay=0.0001), - clip_grad=dict(max_norm=1.0, norm_type=2)) -train_cfg = dict( - _delete_=True, - type='IterBasedTrainLoop', - max_iters=450000, - val_interval=75000) - -# learning rate -param_scheduler = [ - dict( - type='LinearLR', start_factor=0.01, by_epoch=False, begin=0, end=1000), - dict( - type='MultiStepLR', - begin=0, - end=450000, - by_epoch=False, - milestones=[350000, 420000], - gamma=0.1) -] - -default_hooks = dict( - checkpoint=dict(by_epoch=False, interval=75000, max_keep_ckpts=3)) -log_processor = dict(by_epoch=False) diff --git a/projects/DiffusionDet/diffusiondet/__init__.py b/projects/DiffusionDet/diffusiondet/__init__.py deleted file mode 100644 index 35d603220..000000000 --- a/projects/DiffusionDet/diffusiondet/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -from .diffusiondet import DiffusionDet -from .head import (DynamicConv, DynamicDiffusionDetHead, - SingleDiffusionDetHead, SinusoidalPositionEmbeddings) -from .loss import DiffusionDetCriterion, DiffusionDetMatcher - -__all__ = [ - 'DiffusionDet', 'DynamicDiffusionDetHead', 'SingleDiffusionDetHead', - 'SinusoidalPositionEmbeddings', 'DynamicConv', 'DiffusionDetCriterion', - 'DiffusionDetMatcher' -] diff --git a/projects/DiffusionDet/diffusiondet/diffusiondet.py b/projects/DiffusionDet/diffusiondet/diffusiondet.py deleted file mode 100644 index 5a46ddf76..000000000 --- a/projects/DiffusionDet/diffusiondet/diffusiondet.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
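The optimizer and scheduler section of the config above defines a 1000-iteration linear warm-up from 1% of the base learning rate, followed by step decay by 0.1x at 350k and 420k iterations. A rough sketch of the resulting learning-rate curve, under the assumption that the two schedulers compose as described (the helper below is hypothetical, not part of the project):

```python
base_lr = 2.5e-5  # optimizer lr in the config above


def lr_at(iteration: int) -> float:
    """Approximate learning rate at a given training iteration."""
    if iteration < 1000:  # LinearLR warm-up: factor ramps from 0.01 to 1.0
        factor = 0.01 + (1.0 - 0.01) * iteration / 1000
    else:                 # MultiStepLR: decay by 0.1 at each passed milestone
        factor = 0.1 ** sum(iteration >= m for m in (350_000, 420_000))
    return base_lr * factor


print(lr_at(0), lr_at(1000), lr_at(400_000), lr_at(449_999))
# approximately 2.5e-07, 2.5e-05, 2.5e-06, 2.5e-07
```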
-from mmdet.models import SingleStageDetector -from mmdet.registry import MODELS -from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig - - -@MODELS.register_module() -class DiffusionDet(SingleStageDetector): - """Implementation of `DiffusionDet <>`_""" - - def __init__(self, - backbone: ConfigType, - neck: ConfigType, - bbox_head: ConfigType, - train_cfg: OptConfigType = None, - test_cfg: OptConfigType = None, - data_preprocessor: OptConfigType = None, - init_cfg: OptMultiConfig = None) -> None: - super().__init__( - backbone=backbone, - neck=neck, - bbox_head=bbox_head, - train_cfg=train_cfg, - test_cfg=test_cfg, - data_preprocessor=data_preprocessor, - init_cfg=init_cfg) diff --git a/projects/DiffusionDet/diffusiondet/head.py b/projects/DiffusionDet/diffusiondet/head.py deleted file mode 100644 index 794c9c9f6..000000000 --- a/projects/DiffusionDet/diffusiondet/head.py +++ /dev/null @@ -1,1034 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. - -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved -# Modified from https://github.com/ShoufaChen/DiffusionDet/blob/main/diffusiondet/detector.py # noqa -# Modified from https://github.com/ShoufaChen/DiffusionDet/blob/main/diffusiondet/head.py # noqa - -# This work is licensed under the CC-BY-NC 4.0 License. -# Users should be careful about adopting these features in any commercial matters. # noqa -# For more details, please refer to https://github.com/ShoufaChen/DiffusionDet/blob/main/LICENSE # noqa - -import copy -import math -import random -import warnings -from typing import Tuple - -import torch -import torch.nn as nn -import torch.nn.functional as F -from mmcv.cnn import build_activation_layer -from mmcv.ops import batched_nms -from mmengine.structures import InstanceData -from torch import Tensor - -from mmdet.registry import MODELS, TASK_UTILS -from mmdet.structures import SampleList -from mmdet.structures.bbox import (bbox2roi, bbox_cxcywh_to_xyxy, - bbox_xyxy_to_cxcywh, get_box_wh, - scale_boxes) -from mmdet.utils import InstanceList - -_DEFAULT_SCALE_CLAMP = math.log(100000.0 / 16) - - -def cosine_beta_schedule(timesteps, s=0.008): - """Cosine schedule as proposed in - https://openreview.net/forum?id=-NEXDKk8gZ.""" - steps = timesteps + 1 - x = torch.linspace(0, timesteps, steps, dtype=torch.float64) - alphas_cumprod = torch.cos( - ((x / timesteps) + s) / (1 + s) * math.pi * 0.5)**2 - alphas_cumprod = alphas_cumprod / alphas_cumprod[0] - betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1]) - return torch.clip(betas, 0, 0.999) - - -def extract(a, t, x_shape): - """extract the appropriate t index for a batch of indices.""" - batch_size = t.shape[0] - out = a.gather(-1, t) - return out.reshape(batch_size, *((1, ) * (len(x_shape) - 1))) - - -class SinusoidalPositionEmbeddings(nn.Module): - - def __init__(self, dim): - super().__init__() - self.dim = dim - - def forward(self, time): - device = time.device - half_dim = self.dim // 2 - embeddings = math.log(10000) / (half_dim - 1) - embeddings = torch.exp( - torch.arange(half_dim, device=device) * -embeddings) - embeddings = time[:, None] * embeddings[None, :] - embeddings = torch.cat((embeddings.sin(), embeddings.cos()), dim=-1) - return embeddings - - -@MODELS.register_module() -class DynamicDiffusionDetHead(nn.Module): - - def __init__(self, - num_classes=80, - feat_channels=256, - num_proposals=500, - num_heads=6, - prior_prob=0.01, - snr_scale=2.0, - timesteps=1000, - sampling_timesteps=1, - self_condition=False, - box_renewal=True, - 
use_ensemble=True, - deep_supervision=True, - ddim_sampling_eta=1.0, - criterion=dict( - type='DiffusionDetCriterion', - num_classes=80, - assigner=dict( - type='DiffusionDetMatcher', - match_costs=[ - dict( - type='FocalLossCost', - alpha=2.0, - gamma=0.25, - weight=2.0), - dict( - type='BBoxL1Cost', - weight=5.0, - box_format='xyxy'), - dict(type='IoUCost', iou_mode='giou', weight=2.0) - ], - center_radius=2.5, - candidate_topk=5), - ), - single_head=dict( - type='DiffusionDetHead', - num_cls_convs=1, - num_reg_convs=3, - dim_feedforward=2048, - num_heads=8, - dropout=0.0, - act_cfg=dict(type='ReLU'), - dynamic_conv=dict(dynamic_dim=64, dynamic_num=2)), - roi_extractor=dict( - type='SingleRoIExtractor', - roi_layer=dict( - type='RoIAlign', output_size=7, sampling_ratio=2), - out_channels=256, - featmap_strides=[4, 8, 16, 32]), - test_cfg=None, - **kwargs) -> None: - super().__init__() - self.roi_extractor = MODELS.build(roi_extractor) - - self.num_classes = num_classes - self.num_classes = num_classes - self.feat_channels = feat_channels - self.num_proposals = num_proposals - self.num_heads = num_heads - # Build Diffusion - assert isinstance(timesteps, int), 'The type of `timesteps` should ' \ - f'be int but got {type(timesteps)}' - assert sampling_timesteps <= timesteps - self.timesteps = timesteps - self.sampling_timesteps = sampling_timesteps - self.snr_scale = snr_scale - - self.ddim_sampling = self.sampling_timesteps < self.timesteps - self.ddim_sampling_eta = ddim_sampling_eta - self.self_condition = self_condition - self.box_renewal = box_renewal - self.use_ensemble = use_ensemble - - self._build_diffusion() - - # Build assigner - assert criterion.get('assigner', None) is not None - assigner = TASK_UTILS.build(criterion.get('assigner')) - # Init parameters. - self.use_focal_loss = assigner.use_focal_loss - self.use_fed_loss = assigner.use_fed_loss - - # build criterion - criterion.update(deep_supervision=deep_supervision) - self.criterion = TASK_UTILS.build(criterion) - - # Build Dynamic Head. 
- single_head_ = single_head.copy() - single_head_num_classes = single_head_.get('num_classes', None) - if single_head_num_classes is None: - single_head_.update(num_classes=num_classes) - else: - if single_head_num_classes != num_classes: - warnings.warn( - 'The `num_classes` of `DynamicDiffusionDetHead` and ' - '`SingleDiffusionDetHead` should be same, changing ' - f'`single_head.num_classes` to {num_classes}') - single_head_.update(num_classes=num_classes) - - single_head_feat_channels = single_head_.get('feat_channels', None) - if single_head_feat_channels is None: - single_head_.update(feat_channels=feat_channels) - else: - if single_head_feat_channels != feat_channels: - warnings.warn( - 'The `feat_channels` of `DynamicDiffusionDetHead` and ' - '`SingleDiffusionDetHead` should be same, changing ' - f'`single_head.feat_channels` to {feat_channels}') - single_head_.update(feat_channels=feat_channels) - - default_pooler_resolution = roi_extractor['roi_layer'].get( - 'output_size') - assert default_pooler_resolution is not None - single_head_pooler_resolution = single_head_.get('pooler_resolution') - if single_head_pooler_resolution is None: - single_head_.update(pooler_resolution=default_pooler_resolution) - else: - if single_head_pooler_resolution != default_pooler_resolution: - warnings.warn( - 'The `pooler_resolution` of `DynamicDiffusionDetHead` ' - 'and `SingleDiffusionDetHead` should be same, changing ' - f'`single_head.pooler_resolution` to {num_classes}') - single_head_.update( - pooler_resolution=default_pooler_resolution) - - single_head_.update( - use_focal_loss=self.use_focal_loss, use_fed_loss=self.use_fed_loss) - single_head_module = MODELS.build(single_head_) - - self.num_heads = num_heads - self.head_series = nn.ModuleList( - [copy.deepcopy(single_head_module) for _ in range(num_heads)]) - - self.deep_supervision = deep_supervision - - # Gaussian random feature embedding layer for time - time_dim = feat_channels * 4 - self.time_mlp = nn.Sequential( - SinusoidalPositionEmbeddings(feat_channels), - nn.Linear(feat_channels, time_dim), nn.GELU(), - nn.Linear(time_dim, time_dim)) - - self.prior_prob = prior_prob - self.test_cfg = test_cfg - self.use_nms = self.test_cfg.get('use_nms', True) - self._init_weights() - - def _init_weights(self): - # init all parameters. - bias_value = -math.log((1 - self.prior_prob) / self.prior_prob) - for p in self.parameters(): - if p.dim() > 1: - nn.init.xavier_uniform_(p) - - # initialize the bias for focal loss and fed loss. - if self.use_focal_loss or self.use_fed_loss: - if p.shape[-1] == self.num_classes or \ - p.shape[-1] == self.num_classes + 1: - nn.init.constant_(p, bias_value) - - def _build_diffusion(self): - betas = cosine_beta_schedule(self.timesteps) - alphas = 1. - betas - alphas_cumprod = torch.cumprod(alphas, dim=0) - alphas_cumprod_prev = F.pad(alphas_cumprod[:-1], (1, 0), value=1.) - - self.register_buffer('betas', betas) - self.register_buffer('alphas_cumprod', alphas_cumprod) - self.register_buffer('alphas_cumprod_prev', alphas_cumprod_prev) - - # calculations for diffusion q(x_t | x_{t-1}) and others - self.register_buffer('sqrt_alphas_cumprod', torch.sqrt(alphas_cumprod)) - self.register_buffer('sqrt_one_minus_alphas_cumprod', - torch.sqrt(1. - alphas_cumprod)) - self.register_buffer('log_one_minus_alphas_cumprod', - torch.log(1. - alphas_cumprod)) - self.register_buffer('sqrt_recip_alphas_cumprod', - torch.sqrt(1. / alphas_cumprod)) - self.register_buffer('sqrt_recipm1_alphas_cumprod', - torch.sqrt(1. 
/ alphas_cumprod - 1)) - - # calculations for posterior q(x_{t-1} | x_t, x_0) - # equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t) - posterior_variance = betas * (1. - alphas_cumprod_prev) / ( - 1. - alphas_cumprod) - self.register_buffer('posterior_variance', posterior_variance) - - # log calculation clipped because the posterior variance is 0 at - # the beginning of the diffusion chain - self.register_buffer('posterior_log_variance_clipped', - torch.log(posterior_variance.clamp(min=1e-20))) - self.register_buffer( - 'posterior_mean_coef1', - betas * torch.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod)) - self.register_buffer('posterior_mean_coef2', - (1. - alphas_cumprod_prev) * torch.sqrt(alphas) / - (1. - alphas_cumprod)) - - def forward(self, features, init_bboxes, init_t, init_features=None): - time = self.time_mlp(init_t, ) - - inter_class_logits = [] - inter_pred_bboxes = [] - - bs = len(features[0]) - bboxes = init_bboxes - - if init_features is not None: - init_features = init_features[None].repeat(1, bs, 1) - proposal_features = init_features.clone() - else: - proposal_features = None - - for head_idx, single_head in enumerate(self.head_series): - class_logits, pred_bboxes, proposal_features = single_head( - features, bboxes, proposal_features, self.roi_extractor, time) - if self.deep_supervision: - inter_class_logits.append(class_logits) - inter_pred_bboxes.append(pred_bboxes) - bboxes = pred_bboxes.detach() - - if self.deep_supervision: - return torch.stack(inter_class_logits), torch.stack( - inter_pred_bboxes) - else: - return class_logits[None, ...], pred_bboxes[None, ...] - - def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList) -> dict: - """Perform forward propagation and loss calculation of the detection - head on the features of the upstream network. - - Args: - x (tuple[Tensor]): Features from the upstream network, each is - a 4D-tensor. - batch_data_samples (List[:obj:`DetDataSample`]): The Data - Samples. It usually includes information such as - `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. - - Returns: - dict: A dictionary of loss components. 
- """ - prepare_outputs = self.prepare_training_targets(batch_data_samples) - (batch_gt_instances, batch_pred_instances, batch_gt_instances_ignore, - batch_img_metas) = prepare_outputs - - batch_diff_bboxes = torch.stack([ - pred_instances.diff_bboxes_abs - for pred_instances in batch_pred_instances - ]) - batch_time = torch.stack( - [pred_instances.time for pred_instances in batch_pred_instances]) - - pred_logits, pred_bboxes = self(x, batch_diff_bboxes, batch_time) - - output = { - 'pred_logits': pred_logits[-1], - 'pred_boxes': pred_bboxes[-1] - } - if self.deep_supervision: - output['aux_outputs'] = [{ - 'pred_logits': a, - 'pred_boxes': b - } for a, b in zip(pred_logits[:-1], pred_bboxes[:-1])] - - losses = self.criterion(output, batch_gt_instances, batch_img_metas) - return losses - - def prepare_training_targets(self, batch_data_samples): - # hard-setting seed to keep results same (if necessary) - # random.seed(0) - # torch.manual_seed(0) - # torch.cuda.manual_seed_all(0) - # torch.backends.cudnn.deterministic = True - # torch.backends.cudnn.benchmark = False - - batch_gt_instances = [] - batch_pred_instances = [] - batch_gt_instances_ignore = [] - batch_img_metas = [] - for data_sample in batch_data_samples: - img_meta = data_sample.metainfo - gt_instances = data_sample.gt_instances - - gt_bboxes = gt_instances.bboxes - h, w = img_meta['img_shape'] - image_size = gt_bboxes.new_tensor([w, h, w, h]) - - norm_gt_bboxes = gt_bboxes / image_size - norm_gt_bboxes_cxcywh = bbox_xyxy_to_cxcywh(norm_gt_bboxes) - pred_instances = self.prepare_diffusion(norm_gt_bboxes_cxcywh, - image_size) - - gt_instances.set_metainfo(dict(image_size=image_size)) - gt_instances.norm_bboxes_cxcywh = norm_gt_bboxes_cxcywh - - batch_gt_instances.append(gt_instances) - batch_pred_instances.append(pred_instances) - batch_img_metas.append(data_sample.metainfo) - if 'ignored_instances' in data_sample: - batch_gt_instances_ignore.append(data_sample.ignored_instances) - else: - batch_gt_instances_ignore.append(None) - return (batch_gt_instances, batch_pred_instances, - batch_gt_instances_ignore, batch_img_metas) - - def prepare_diffusion(self, gt_boxes, image_size): - device = gt_boxes.device - time = torch.randint( - 0, self.timesteps, (1, ), dtype=torch.long, device=device) - noise = torch.randn(self.num_proposals, 4, device=device) - - num_gt = gt_boxes.shape[0] - if num_gt < self.num_proposals: - # 3 * sigma = 1/2 --> sigma: 1/6 - box_placeholder = torch.randn( - self.num_proposals - num_gt, 4, device=device) / 6. + 0.5 - box_placeholder[:, 2:] = torch.clip( - box_placeholder[:, 2:], min=1e-4) - x_start = torch.cat((gt_boxes, box_placeholder), dim=0) - else: - select_mask = [True] * self.num_proposals + \ - [False] * (num_gt - self.num_proposals) - random.shuffle(select_mask) - x_start = gt_boxes[select_mask] - - x_start = (x_start * 2. - 1.) * self.snr_scale - - # noise sample - x = self.q_sample(x_start=x_start, time=time, noise=noise) - - x = torch.clamp(x, min=-1 * self.snr_scale, max=self.snr_scale) - x = ((x / self.snr_scale) + 1) / 2. 
- - diff_bboxes = bbox_cxcywh_to_xyxy(x) - # convert to abs bboxes - diff_bboxes_abs = diff_bboxes * image_size - - metainfo = dict(time=time.squeeze(-1)) - pred_instances = InstanceData(metainfo=metainfo) - pred_instances.diff_bboxes = diff_bboxes - pred_instances.diff_bboxes_abs = diff_bboxes_abs - pred_instances.noise = noise - return pred_instances - - # forward diffusion - def q_sample(self, x_start, time, noise=None): - if noise is None: - noise = torch.randn_like(x_start) - - x_start_shape = x_start.shape - - sqrt_alphas_cumprod_t = extract(self.sqrt_alphas_cumprod, time, - x_start_shape) - sqrt_one_minus_alphas_cumprod_t = extract( - self.sqrt_one_minus_alphas_cumprod, time, x_start_shape) - - return sqrt_alphas_cumprod_t * x_start + \ - sqrt_one_minus_alphas_cumprod_t * noise - - def predict(self, - x: Tuple[Tensor], - batch_data_samples: SampleList, - rescale: bool = False) -> InstanceList: - """Perform forward propagation of the detection head and predict - detection results on the features of the upstream network. - - Args: - x (tuple[Tensor]): Multi-level features from the - upstream network, each is a 4D-tensor. - batch_data_samples (List[:obj:`DetDataSample`]): The Data - Samples. It usually includes information such as - `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. - rescale (bool, optional): Whether to rescale the results. - Defaults to False. - - Returns: - list[obj:`InstanceData`]: Detection results of each image - after the post process. - """ - # hard-setting seed to keep results same (if necessary) - # seed = 0 - # random.seed(seed) - # torch.manual_seed(seed) - # torch.cuda.manual_seed_all(seed) - - device = x[-1].device - - batch_img_metas = [ - data_samples.metainfo for data_samples in batch_data_samples - ] - - (time_pairs, batch_noise_bboxes, batch_noise_bboxes_raw, - batch_image_size) = self.prepare_testing_targets( - batch_img_metas, device) - - predictions = self.predict_by_feat( - x, - time_pairs=time_pairs, - batch_noise_bboxes=batch_noise_bboxes, - batch_noise_bboxes_raw=batch_noise_bboxes_raw, - batch_image_size=batch_image_size, - device=device, - batch_img_metas=batch_img_metas) - return predictions - - def predict_by_feat(self, - x, - time_pairs, - batch_noise_bboxes, - batch_noise_bboxes_raw, - batch_image_size, - device, - batch_img_metas=None, - cfg=None, - rescale=True): - - batch_size = len(batch_img_metas) - - cfg = self.test_cfg if cfg is None else cfg - cfg = copy.deepcopy(cfg) - - ensemble_score, ensemble_label, ensemble_coord = [], [], [] - for time, time_next in time_pairs: - batch_time = torch.full((batch_size, ), - time, - device=device, - dtype=torch.long) - # self_condition = x_start if self.self_condition else None - pred_logits, pred_bboxes = self(x, batch_noise_bboxes, batch_time) - - x_start = pred_bboxes[-1] - - x_start = x_start / batch_image_size[:, None, :] - x_start = bbox_xyxy_to_cxcywh(x_start) - x_start = (x_start * 2 - 1.) 
* self.snr_scale - x_start = torch.clamp( - x_start, min=-1 * self.snr_scale, max=self.snr_scale) - pred_noise = self.predict_noise_from_start(batch_noise_bboxes_raw, - batch_time, x_start) - pred_noise_list, x_start_list = [], [] - noise_bboxes_list, num_remain_list = [], [] - if self.box_renewal: # filter - score_thr = cfg.get('score_thr', 0) - for img_id in range(batch_size): - score_per_image = pred_logits[-1][img_id] - - score_per_image = torch.sigmoid(score_per_image) - value, _ = torch.max(score_per_image, -1, keepdim=False) - keep_idx = value > score_thr - - num_remain_list.append(torch.sum(keep_idx)) - pred_noise_list.append(pred_noise[img_id, keep_idx, :]) - x_start_list.append(x_start[img_id, keep_idx, :]) - noise_bboxes_list.append(batch_noise_bboxes[img_id, - keep_idx, :]) - if time_next < 0: - # Not same as original DiffusionDet - if self.use_ensemble and self.sampling_timesteps > 1: - box_pred_per_image, scores_per_image, labels_per_image = \ - self.inference( - box_cls=pred_logits[-1], - box_pred=pred_bboxes[-1], - cfg=cfg, - device=device) - ensemble_score.append(scores_per_image) - ensemble_label.append(labels_per_image) - ensemble_coord.append(box_pred_per_image) - continue - - alpha = self.alphas_cumprod[time] - alpha_next = self.alphas_cumprod[time_next] - - sigma = self.ddim_sampling_eta * ((1 - alpha / alpha_next) * - (1 - alpha_next) / - (1 - alpha)).sqrt() - c = (1 - alpha_next - sigma**2).sqrt() - - batch_noise_bboxes_list = [] - batch_noise_bboxes_raw_list = [] - for idx in range(batch_size): - pred_noise = pred_noise_list[idx] - x_start = x_start_list[idx] - noise_bboxes = noise_bboxes_list[idx] - num_remain = num_remain_list[idx] - noise = torch.randn_like(noise_bboxes) - - noise_bboxes = x_start * alpha_next.sqrt() + \ - c * pred_noise + sigma * noise - - if self.box_renewal: # filter - # replenish with randn boxes - if num_remain < self.num_proposals: - noise_bboxes = torch.cat( - (noise_bboxes, - torch.randn( - self.num_proposals - num_remain, - 4, - device=device)), - dim=0) - else: - select_mask = [True] * self.num_proposals + \ - [False] * (num_remain - - self.num_proposals) - random.shuffle(select_mask) - noise_bboxes = noise_bboxes[select_mask] - - # raw noise boxes - batch_noise_bboxes_raw_list.append(noise_bboxes) - # resize to xyxy - noise_bboxes = torch.clamp( - noise_bboxes, - min=-1 * self.snr_scale, - max=self.snr_scale) - noise_bboxes = ((noise_bboxes / self.snr_scale) + 1) / 2 - noise_bboxes = bbox_cxcywh_to_xyxy(noise_bboxes) - noise_bboxes = noise_bboxes * batch_image_size[idx] - - batch_noise_bboxes_list.append(noise_bboxes) - batch_noise_bboxes = torch.stack(batch_noise_bboxes_list) - batch_noise_bboxes_raw = torch.stack(batch_noise_bboxes_raw_list) - if self.use_ensemble and self.sampling_timesteps > 1: - box_pred_per_image, scores_per_image, labels_per_image = \ - self.inference( - box_cls=pred_logits[-1], - box_pred=pred_bboxes[-1], - cfg=cfg, - device=device) - ensemble_score.append(scores_per_image) - ensemble_label.append(labels_per_image) - ensemble_coord.append(box_pred_per_image) - if self.use_ensemble and self.sampling_timesteps > 1: - steps = len(ensemble_score) - results_list = [] - for idx in range(batch_size): - ensemble_score_per_img = [ - ensemble_score[i][idx] for i in range(steps) - ] - ensemble_label_per_img = [ - ensemble_label[i][idx] for i in range(steps) - ] - ensemble_coord_per_img = [ - ensemble_coord[i][idx] for i in range(steps) - ] - - scores_per_image = torch.cat(ensemble_score_per_img, dim=0) - labels_per_image 
= torch.cat(ensemble_label_per_img, dim=0) - box_pred_per_image = torch.cat(ensemble_coord_per_img, dim=0) - - if self.use_nms: - det_bboxes, keep_idxs = batched_nms( - box_pred_per_image, scores_per_image, labels_per_image, - cfg.nms) - box_pred_per_image = box_pred_per_image[keep_idxs] - labels_per_image = labels_per_image[keep_idxs] - scores_per_image = det_bboxes[:, -1] - results = InstanceData() - results.bboxes = box_pred_per_image - results.scores = scores_per_image - results.labels = labels_per_image - results_list.append(results) - else: - box_cls = pred_logits[-1] - box_pred = pred_bboxes[-1] - results_list = self.inference(box_cls, box_pred, cfg, device) - if rescale: - results_list = self.do_results_post_process( - results_list, cfg, batch_img_metas=batch_img_metas) - return results_list - - @staticmethod - def do_results_post_process(results_list, cfg, batch_img_metas=None): - processed_results = [] - for results, img_meta in zip(results_list, batch_img_metas): - assert img_meta.get('scale_factor') is not None - scale_factor = [1 / s for s in img_meta['scale_factor']] - results.bboxes = scale_boxes(results.bboxes, scale_factor) - # clip w, h - h, w = img_meta['ori_shape'] - results.bboxes[:, 0::2] = results.bboxes[:, 0::2].clamp( - min=0, max=w) - results.bboxes[:, 1::2] = results.bboxes[:, 1::2].clamp( - min=0, max=h) - - # filter small size bboxes - if cfg.get('min_bbox_size', 0) >= 0: - w, h = get_box_wh(results.bboxes) - valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size) - if not valid_mask.all(): - results = results[valid_mask] - processed_results.append(results) - - return processed_results - - def prepare_testing_targets(self, batch_img_metas, device): - # [-1, 0, 1, 2, ..., T-1] when sampling_timesteps == timesteps - times = torch.linspace( - -1, self.timesteps - 1, steps=self.sampling_timesteps + 1) - times = list(reversed(times.int().tolist())) - # [(T-1, T-2), (T-2, T-3), ..., (1, 0), (0, -1)] - time_pairs = list(zip(times[:-1], times[1:])) - - noise_bboxes_list = [] - noise_bboxes_raw_list = [] - image_size_list = [] - for img_meta in batch_img_metas: - h, w = img_meta['img_shape'] - image_size = torch.tensor([w, h, w, h], - dtype=torch.float32, - device=device) - noise_bboxes_raw = torch.randn((self.num_proposals, 4), - device=device) - noise_bboxes = torch.clamp( - noise_bboxes_raw, min=-1 * self.snr_scale, max=self.snr_scale) - noise_bboxes = ((noise_bboxes / self.snr_scale) + 1) / 2 - noise_bboxes = bbox_cxcywh_to_xyxy(noise_bboxes) - noise_bboxes = noise_bboxes * image_size - - noise_bboxes_raw_list.append(noise_bboxes_raw) - noise_bboxes_list.append(noise_bboxes) - image_size_list.append(image_size[None]) - batch_noise_bboxes = torch.stack(noise_bboxes_list) - batch_image_size = torch.cat(image_size_list) - batch_noise_bboxes_raw = torch.stack(noise_bboxes_raw_list) - return (time_pairs, batch_noise_bboxes, batch_noise_bboxes_raw, - batch_image_size) - - def predict_noise_from_start(self, x_t, t, x0): - results = (extract( - self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - x0) / \ - extract(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) - return results - - def inference(self, box_cls, box_pred, cfg, device): - """ - Args: - box_cls (Tensor): tensor of shape (batch_size, num_proposals, K). - The tensor predicts the classification probability for - each proposal. - box_pred (Tensor): tensors of shape (batch_size, num_proposals, 4). 
- The tensor predicts 4-vector (x,y,w,h) box - regression values for every proposal - - Returns: - results (List[Instances]): a list of #images elements. - """ - results = [] - - if self.use_focal_loss or self.use_fed_loss: - scores = torch.sigmoid(box_cls) - labels = torch.arange( - self.num_classes, - device=device).unsqueeze(0).repeat(self.num_proposals, - 1).flatten(0, 1) - box_pred_list = [] - scores_list = [] - labels_list = [] - for i, (scores_per_image, - box_pred_per_image) in enumerate(zip(scores, box_pred)): - - scores_per_image, topk_indices = scores_per_image.flatten( - 0, 1).topk( - self.num_proposals, sorted=False) - labels_per_image = labels[topk_indices] - box_pred_per_image = box_pred_per_image.view(-1, 1, 4).repeat( - 1, self.num_classes, 1).view(-1, 4) - box_pred_per_image = box_pred_per_image[topk_indices] - - if self.use_ensemble and self.sampling_timesteps > 1: - box_pred_list.append(box_pred_per_image) - scores_list.append(scores_per_image) - labels_list.append(labels_per_image) - continue - - if self.use_nms: - det_bboxes, keep_idxs = batched_nms( - box_pred_per_image, scores_per_image, labels_per_image, - cfg.nms) - box_pred_per_image = box_pred_per_image[keep_idxs] - labels_per_image = labels_per_image[keep_idxs] - # some nms would reweight the score, such as softnms - scores_per_image = det_bboxes[:, -1] - result = InstanceData() - result.bboxes = box_pred_per_image - result.scores = scores_per_image - result.labels = labels_per_image - results.append(result) - - else: - # For each box we assign the best class or the second - # best if the best on is `no_object`. - scores, labels = F.softmax(box_cls, dim=-1)[:, :, :-1].max(-1) - - for i, (scores_per_image, labels_per_image, - box_pred_per_image) in enumerate( - zip(scores, labels, box_pred)): - if self.use_ensemble and self.sampling_timesteps > 1: - return box_pred_per_image, scores_per_image, \ - labels_per_image - - if self.use_nms: - det_bboxes, keep_idxs = batched_nms( - box_pred_per_image, scores_per_image, labels_per_image, - cfg.nms) - box_pred_per_image = box_pred_per_image[keep_idxs] - labels_per_image = labels_per_image[keep_idxs] - # some nms would reweight the score, such as softnms - scores_per_image = det_bboxes[:, -1] - - result = InstanceData() - result.bboxes = box_pred_per_image - result.scores = scores_per_image - result.labels = labels_per_image - results.append(result) - if self.use_ensemble and self.sampling_timesteps > 1: - return box_pred_list, scores_list, labels_list - else: - return results - - -@MODELS.register_module() -class SingleDiffusionDetHead(nn.Module): - - def __init__( - self, - num_classes=80, - feat_channels=256, - dim_feedforward=2048, - num_cls_convs=1, - num_reg_convs=3, - num_heads=8, - dropout=0.0, - pooler_resolution=7, - scale_clamp=_DEFAULT_SCALE_CLAMP, - bbox_weights=(2.0, 2.0, 1.0, 1.0), - use_focal_loss=True, - use_fed_loss=False, - act_cfg=dict(type='ReLU', inplace=True), - dynamic_conv=dict(dynamic_dim=64, dynamic_num=2) - ) -> None: - super().__init__() - self.feat_channels = feat_channels - - # Dynamic - self.self_attn = nn.MultiheadAttention( - feat_channels, num_heads, dropout=dropout) - self.inst_interact = DynamicConv( - feat_channels=feat_channels, - pooler_resolution=pooler_resolution, - dynamic_dim=dynamic_conv['dynamic_dim'], - dynamic_num=dynamic_conv['dynamic_num']) - - self.linear1 = nn.Linear(feat_channels, dim_feedforward) - self.dropout = nn.Dropout(dropout) - self.linear2 = nn.Linear(dim_feedforward, feat_channels) - - self.norm1 = 
nn.LayerNorm(feat_channels) - self.norm2 = nn.LayerNorm(feat_channels) - self.norm3 = nn.LayerNorm(feat_channels) - self.dropout1 = nn.Dropout(dropout) - self.dropout2 = nn.Dropout(dropout) - self.dropout3 = nn.Dropout(dropout) - - self.activation = build_activation_layer(act_cfg) - - # block time mlp - self.block_time_mlp = nn.Sequential( - nn.SiLU(), nn.Linear(feat_channels * 4, feat_channels * 2)) - - # cls. - cls_module = list() - for _ in range(num_cls_convs): - cls_module.append(nn.Linear(feat_channels, feat_channels, False)) - cls_module.append(nn.LayerNorm(feat_channels)) - cls_module.append(nn.ReLU(inplace=True)) - self.cls_module = nn.ModuleList(cls_module) - - # reg. - reg_module = list() - for _ in range(num_reg_convs): - reg_module.append(nn.Linear(feat_channels, feat_channels, False)) - reg_module.append(nn.LayerNorm(feat_channels)) - reg_module.append(nn.ReLU(inplace=True)) - self.reg_module = nn.ModuleList(reg_module) - - # pred. - self.use_focal_loss = use_focal_loss - self.use_fed_loss = use_fed_loss - if self.use_focal_loss or self.use_fed_loss: - self.class_logits = nn.Linear(feat_channels, num_classes) - else: - self.class_logits = nn.Linear(feat_channels, num_classes + 1) - self.bboxes_delta = nn.Linear(feat_channels, 4) - self.scale_clamp = scale_clamp - self.bbox_weights = bbox_weights - - def forward(self, features, bboxes, pro_features, pooler, time_emb): - """ - :param bboxes: (N, num_boxes, 4) - :param pro_features: (N, num_boxes, feat_channels) - """ - - N, num_boxes = bboxes.shape[:2] - - # roi_feature. - proposal_boxes = list() - for b in range(N): - proposal_boxes.append(bboxes[b]) - rois = bbox2roi(proposal_boxes) - - roi_features = pooler(features, rois) - - if pro_features is None: - pro_features = roi_features.view(N, num_boxes, self.feat_channels, - -1).mean(-1) - - roi_features = roi_features.view(N * num_boxes, self.feat_channels, - -1).permute(2, 0, 1) - - # self_att. - pro_features = pro_features.view(N, num_boxes, - self.feat_channels).permute(1, 0, 2) - pro_features2 = self.self_attn( - pro_features, pro_features, value=pro_features)[0] - pro_features = pro_features + self.dropout1(pro_features2) - pro_features = self.norm1(pro_features) - - # inst_interact. - pro_features = pro_features.view( - num_boxes, N, - self.feat_channels).permute(1, 0, - 2).reshape(1, N * num_boxes, - self.feat_channels) - pro_features2 = self.inst_interact(pro_features, roi_features) - pro_features = pro_features + self.dropout2(pro_features2) - obj_features = self.norm2(pro_features) - - # obj_feature. 
- obj_features2 = self.linear2( - self.dropout(self.activation(self.linear1(obj_features)))) - obj_features = obj_features + self.dropout3(obj_features2) - obj_features = self.norm3(obj_features) - - fc_feature = obj_features.transpose(0, 1).reshape(N * num_boxes, -1) - - scale_shift = self.block_time_mlp(time_emb) - scale_shift = torch.repeat_interleave(scale_shift, num_boxes, dim=0) - scale, shift = scale_shift.chunk(2, dim=1) - fc_feature = fc_feature * (scale + 1) + shift - - cls_feature = fc_feature.clone() - reg_feature = fc_feature.clone() - for cls_layer in self.cls_module: - cls_feature = cls_layer(cls_feature) - for reg_layer in self.reg_module: - reg_feature = reg_layer(reg_feature) - class_logits = self.class_logits(cls_feature) - bboxes_deltas = self.bboxes_delta(reg_feature) - pred_bboxes = self.apply_deltas(bboxes_deltas, bboxes.view(-1, 4)) - - return (class_logits.view(N, num_boxes, - -1), pred_bboxes.view(N, num_boxes, - -1), obj_features) - - def apply_deltas(self, deltas, boxes): - """Apply transformation `deltas` (dx, dy, dw, dh) to `boxes`. - - Args: - deltas (Tensor): transformation deltas of shape (N, k*4), - where k >= 1. deltas[i] represents k potentially - different class-specific box transformations for - the single box boxes[i]. - boxes (Tensor): boxes to transform, of shape (N, 4) - """ - boxes = boxes.to(deltas.dtype) - - widths = boxes[:, 2] - boxes[:, 0] - heights = boxes[:, 3] - boxes[:, 1] - ctr_x = boxes[:, 0] + 0.5 * widths - ctr_y = boxes[:, 1] + 0.5 * heights - - wx, wy, ww, wh = self.bbox_weights - dx = deltas[:, 0::4] / wx - dy = deltas[:, 1::4] / wy - dw = deltas[:, 2::4] / ww - dh = deltas[:, 3::4] / wh - - # Prevent sending too large values into torch.exp() - dw = torch.clamp(dw, max=self.scale_clamp) - dh = torch.clamp(dh, max=self.scale_clamp) - - pred_ctr_x = dx * widths[:, None] + ctr_x[:, None] - pred_ctr_y = dy * heights[:, None] + ctr_y[:, None] - pred_w = torch.exp(dw) * widths[:, None] - pred_h = torch.exp(dh) * heights[:, None] - - pred_boxes = torch.zeros_like(deltas) - pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w # x1 - pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h # y1 - pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w # x2 - pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h # y2 - - return pred_boxes - - -class DynamicConv(nn.Module): - - def __init__(self, - feat_channels: int, - dynamic_dim: int = 64, - dynamic_num: int = 2, - pooler_resolution: int = 7) -> None: - super().__init__() - - self.feat_channels = feat_channels - self.dynamic_dim = dynamic_dim - self.dynamic_num = dynamic_num - self.num_params = self.feat_channels * self.dynamic_dim - self.dynamic_layer = nn.Linear(self.feat_channels, - self.dynamic_num * self.num_params) - - self.norm1 = nn.LayerNorm(self.dynamic_dim) - self.norm2 = nn.LayerNorm(self.feat_channels) - - self.activation = nn.ReLU(inplace=True) - - num_output = self.feat_channels * pooler_resolution**2 - self.out_layer = nn.Linear(num_output, self.feat_channels) - self.norm3 = nn.LayerNorm(self.feat_channels) - - def forward(self, pro_features: Tensor, roi_features: Tensor) -> Tensor: - """Forward function. 
- - Args: - pro_features: (1, N * num_boxes, self.feat_channels) - roi_features: (49, N * num_boxes, self.feat_channels) - - Returns: - """ - features = roi_features.permute(1, 0, 2) - parameters = self.dynamic_layer(pro_features).permute(1, 0, 2) - - param1 = parameters[:, :, :self.num_params].view( - -1, self.feat_channels, self.dynamic_dim) - param2 = parameters[:, :, - self.num_params:].view(-1, self.dynamic_dim, - self.feat_channels) - - features = torch.bmm(features, param1) - features = self.norm1(features) - features = self.activation(features) - - features = torch.bmm(features, param2) - features = self.norm2(features) - features = self.activation(features) - - features = features.flatten(1) - features = self.out_layer(features) - features = self.norm3(features) - features = self.activation(features) - - return features diff --git a/projects/DiffusionDet/diffusiondet/loss.py b/projects/DiffusionDet/diffusiondet/loss.py deleted file mode 100644 index 3d532f1ff..000000000 --- a/projects/DiffusionDet/diffusiondet/loss.py +++ /dev/null @@ -1,341 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. - -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved -# Modified from https://github.com/ShoufaChen/DiffusionDet/blob/main/diffusiondet/loss.py # noqa - -# This work is licensed under the CC-BY-NC 4.0 License. -# Users should be careful about adopting these features in any commercial matters. # noqa -# For more details, please refer to https://github.com/ShoufaChen/DiffusionDet/blob/main/LICENSE # noqa - -from typing import List, Tuple, Union - -import torch -import torch.nn as nn -from mmengine.config import ConfigDict -from mmengine.structures import InstanceData -from torch import Tensor - -from mmdet.registry import MODELS, TASK_UTILS -from mmdet.structures.bbox import bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh -from mmdet.utils import ConfigType - - -@TASK_UTILS.register_module() -class DiffusionDetCriterion(nn.Module): - - def __init__( - self, - num_classes, - assigner: Union[ConfigDict, nn.Module], - deep_supervision=True, - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - alpha=0.25, - gamma=2.0, - reduction='sum', - loss_weight=2.0), - loss_bbox=dict(type='L1Loss', reduction='sum', loss_weight=5.0), - loss_giou=dict(type='GIoULoss', reduction='sum', loss_weight=2.0), - ): - - super().__init__() - self.num_classes = num_classes - - if isinstance(assigner, nn.Module): - self.assigner = assigner - else: - self.assigner = TASK_UTILS.build(assigner) - - self.deep_supervision = deep_supervision - - self.loss_cls = MODELS.build(loss_cls) - self.loss_bbox = MODELS.build(loss_bbox) - self.loss_giou = MODELS.build(loss_giou) - - def forward(self, outputs, batch_gt_instances, batch_img_metas): - batch_indices = self.assigner(outputs, batch_gt_instances, - batch_img_metas) - # Compute all the requested losses - loss_cls = self.loss_classification(outputs, batch_gt_instances, - batch_indices) - loss_bbox, loss_giou = self.loss_boxes(outputs, batch_gt_instances, - batch_indices) - - losses = dict( - loss_cls=loss_cls, loss_bbox=loss_bbox, loss_giou=loss_giou) - - if self.deep_supervision: - assert 'aux_outputs' in outputs - for i, aux_outputs in enumerate(outputs['aux_outputs']): - batch_indices = self.assigner(aux_outputs, batch_gt_instances, - batch_img_metas) - loss_cls = self.loss_classification(aux_outputs, - batch_gt_instances, - batch_indices) - loss_bbox, loss_giou = self.loss_boxes(aux_outputs, - batch_gt_instances, - batch_indices) - tmp_losses = dict( - 
loss_cls=loss_cls, - loss_bbox=loss_bbox, - loss_giou=loss_giou) - for name, value in tmp_losses.items(): - losses[f's.{i}.{name}'] = value - return losses - - def loss_classification(self, outputs, batch_gt_instances, indices): - assert 'pred_logits' in outputs - src_logits = outputs['pred_logits'] - target_classes_list = [ - gt.labels[J] for gt, (_, J) in zip(batch_gt_instances, indices) - ] - target_classes = torch.full( - src_logits.shape[:2], - self.num_classes, - dtype=torch.int64, - device=src_logits.device) - for idx in range(len(batch_gt_instances)): - target_classes[idx, indices[idx][0]] = target_classes_list[idx] - - src_logits = src_logits.flatten(0, 1) - target_classes = target_classes.flatten(0, 1) - # comp focal loss. - num_instances = max(torch.cat(target_classes_list).shape[0], 1) - loss_cls = self.loss_cls( - src_logits, - target_classes, - ) / num_instances - return loss_cls - - def loss_boxes(self, outputs, batch_gt_instances, indices): - assert 'pred_boxes' in outputs - pred_boxes = outputs['pred_boxes'] - - target_bboxes_norm_list = [ - gt.norm_bboxes_cxcywh[J] - for gt, (_, J) in zip(batch_gt_instances, indices) - ] - target_bboxes_list = [ - gt.bboxes[J] for gt, (_, J) in zip(batch_gt_instances, indices) - ] - - pred_bboxes_list = [] - pred_bboxes_norm_list = [] - for idx in range(len(batch_gt_instances)): - pred_bboxes_list.append(pred_boxes[idx, indices[idx][0]]) - image_size = batch_gt_instances[idx].image_size - pred_bboxes_norm_list.append(pred_boxes[idx, indices[idx][0]] / - image_size) - - pred_boxes_cat = torch.cat(pred_bboxes_list) - pred_boxes_norm_cat = torch.cat(pred_bboxes_norm_list) - target_bboxes_cat = torch.cat(target_bboxes_list) - target_bboxes_norm_cat = torch.cat(target_bboxes_norm_list) - - if len(pred_boxes_cat) > 0: - num_instances = pred_boxes_cat.shape[0] - - loss_bbox = self.loss_bbox( - pred_boxes_norm_cat, - bbox_cxcywh_to_xyxy(target_bboxes_norm_cat)) / num_instances - loss_giou = self.loss_giou(pred_boxes_cat, - target_bboxes_cat) / num_instances - else: - loss_bbox = pred_boxes.sum() * 0 - loss_giou = pred_boxes.sum() * 0 - return loss_bbox, loss_giou - - -@TASK_UTILS.register_module() -class DiffusionDetMatcher(nn.Module): - """This class computes an assignment between the targets and the - predictions of the network For efficiency reasons, the targets don't - include the no_object. - - Because of this, in general, there are more predictions than targets. In - this case, we do a 1-to-k (dynamic) matching of the best predictions, while - the others are un-matched (and thus treated as non-objects). - """ - - def __init__(self, - match_costs: Union[List[Union[dict, ConfigDict]], dict, - ConfigDict], - center_radius: float = 2.5, - candidate_topk: int = 5, - iou_calculator: ConfigType = dict(type='BboxOverlaps2D'), - **kwargs): - super().__init__() - - self.center_radius = center_radius - self.candidate_topk = candidate_topk - - if isinstance(match_costs, dict): - match_costs = [match_costs] - elif isinstance(match_costs, list): - assert len(match_costs) > 0, \ - 'match_costs must not be a empty list.' 
- self.use_focal_loss = False - self.use_fed_loss = False - for _match_cost in match_costs: - if _match_cost.get('type') == 'FocalLossCost': - self.use_focal_loss = True - if _match_cost.get('type') == 'FedLoss': - self.use_fed_loss = True - raise NotImplementedError - - self.match_costs = [ - TASK_UTILS.build(match_cost) for match_cost in match_costs - ] - self.iou_calculator = TASK_UTILS.build(iou_calculator) - - def forward(self, outputs, batch_gt_instances, batch_img_metas): - assert 'pred_logits' in outputs and 'pred_boxes' in outputs - - pred_logits = outputs['pred_logits'] - pred_bboxes = outputs['pred_boxes'] - batch_size = len(batch_gt_instances) - - assert batch_size == pred_logits.shape[0] == pred_bboxes.shape[0] - batch_indices = [] - for i in range(batch_size): - pred_instances = InstanceData() - pred_instances.bboxes = pred_bboxes[i, ...] - pred_instances.scores = pred_logits[i, ...] - gt_instances = batch_gt_instances[i] - img_meta = batch_img_metas[i] - indices = self.single_assigner(pred_instances, gt_instances, - img_meta) - batch_indices.append(indices) - return batch_indices - - def single_assigner(self, pred_instances, gt_instances, img_meta): - with torch.no_grad(): - gt_bboxes = gt_instances.bboxes - pred_bboxes = pred_instances.bboxes - num_gt = gt_bboxes.size(0) - - if num_gt == 0: # empty object in key frame - valid_mask = pred_bboxes.new_zeros((pred_bboxes.shape[0], ), - dtype=torch.bool) - matched_gt_inds = pred_bboxes.new_zeros((gt_bboxes.shape[0], ), - dtype=torch.long) - return valid_mask, matched_gt_inds - - valid_mask, is_in_boxes_and_center = \ - self.get_in_gt_and_in_center_info( - bbox_xyxy_to_cxcywh(pred_bboxes), - bbox_xyxy_to_cxcywh(gt_bboxes) - ) - - cost_list = [] - for match_cost in self.match_costs: - cost = match_cost( - pred_instances=pred_instances, - gt_instances=gt_instances, - img_meta=img_meta) - cost_list.append(cost) - - pairwise_ious = self.iou_calculator(pred_bboxes, gt_bboxes) - - cost_list.append((~is_in_boxes_and_center) * 100.0) - cost_matrix = torch.stack(cost_list).sum(0) - cost_matrix[~valid_mask] = cost_matrix[~valid_mask] + 10000.0 - - fg_mask_inboxes, matched_gt_inds = \ - self.dynamic_k_matching( - cost_matrix, pairwise_ious, num_gt) - return fg_mask_inboxes, matched_gt_inds - - def get_in_gt_and_in_center_info( - self, pred_bboxes: Tensor, - gt_bboxes: Tensor) -> Tuple[Tensor, Tensor]: - """Get the information of which prior is in gt bboxes and gt center - priors.""" - xy_target_gts = bbox_cxcywh_to_xyxy(gt_bboxes) # (x1, y1, x2, y2) - - pred_bboxes_center_x = pred_bboxes[:, 0].unsqueeze(1) - pred_bboxes_center_y = pred_bboxes[:, 1].unsqueeze(1) - - # whether the center of each anchor is inside a gt box - b_l = pred_bboxes_center_x > xy_target_gts[:, 0].unsqueeze(0) - b_r = pred_bboxes_center_x < xy_target_gts[:, 2].unsqueeze(0) - b_t = pred_bboxes_center_y > xy_target_gts[:, 1].unsqueeze(0) - b_b = pred_bboxes_center_y < xy_target_gts[:, 3].unsqueeze(0) - # (b_l.long()+b_r.long()+b_t.long()+b_b.long())==4 [300,num_gt] , - is_in_boxes = ((b_l.long() + b_r.long() + b_t.long() + - b_b.long()) == 4) - is_in_boxes_all = is_in_boxes.sum(1) > 0 # [num_query] - # in fixed center - center_radius = 2.5 - # Modified to self-adapted sampling --- the center size depends - # on the size of the gt boxes - # https://github.com/dulucas/UVO_Challenge/blob/main/Track1/detection/mmdet/core/bbox/assigners/rpn_sim_ota_assigner.py#L212 # noqa - b_l = pred_bboxes_center_x > ( - gt_bboxes[:, 0] - - (center_radius * - (xy_target_gts[:, 2] - 
xy_target_gts[:, 0]))).unsqueeze(0) - b_r = pred_bboxes_center_x < ( - gt_bboxes[:, 0] + - (center_radius * - (xy_target_gts[:, 2] - xy_target_gts[:, 0]))).unsqueeze(0) - b_t = pred_bboxes_center_y > ( - gt_bboxes[:, 1] - - (center_radius * - (xy_target_gts[:, 3] - xy_target_gts[:, 1]))).unsqueeze(0) - b_b = pred_bboxes_center_y < ( - gt_bboxes[:, 1] + - (center_radius * - (xy_target_gts[:, 3] - xy_target_gts[:, 1]))).unsqueeze(0) - - is_in_centers = ((b_l.long() + b_r.long() + b_t.long() + - b_b.long()) == 4) - is_in_centers_all = is_in_centers.sum(1) > 0 - - is_in_boxes_anchor = is_in_boxes_all | is_in_centers_all - is_in_boxes_and_center = (is_in_boxes & is_in_centers) - - return is_in_boxes_anchor, is_in_boxes_and_center - - def dynamic_k_matching(self, cost: Tensor, pairwise_ious: Tensor, - num_gt: int) -> Tuple[Tensor, Tensor]: - """Use IoU and matching cost to calculate the dynamic top-k positive - targets.""" - matching_matrix = torch.zeros_like(cost) - # select candidate topk ious for dynamic-k calculation - candidate_topk = min(self.candidate_topk, pairwise_ious.size(0)) - topk_ious, _ = torch.topk(pairwise_ious, candidate_topk, dim=0) - # calculate dynamic k for each gt - dynamic_ks = torch.clamp(topk_ious.sum(0).int(), min=1) - for gt_idx in range(num_gt): - _, pos_idx = torch.topk( - cost[:, gt_idx], k=dynamic_ks[gt_idx], largest=False) - matching_matrix[:, gt_idx][pos_idx] = 1 - - del topk_ious, dynamic_ks, pos_idx - - prior_match_gt_mask = matching_matrix.sum(1) > 1 - if prior_match_gt_mask.sum() > 0: - _, cost_argmin = torch.min(cost[prior_match_gt_mask, :], dim=1) - matching_matrix[prior_match_gt_mask, :] *= 0 - matching_matrix[prior_match_gt_mask, cost_argmin] = 1 - - while (matching_matrix.sum(0) == 0).any(): - matched_query_id = matching_matrix.sum(1) > 0 - cost[matched_query_id] += 100000.0 - unmatch_id = torch.nonzero( - matching_matrix.sum(0) == 0, as_tuple=False).squeeze(1) - for gt_idx in unmatch_id: - pos_idx = torch.argmin(cost[:, gt_idx]) - matching_matrix[:, gt_idx][pos_idx] = 1.0 - if (matching_matrix.sum(1) > 1).sum() > 0: - _, cost_argmin = torch.min(cost[prior_match_gt_mask], dim=1) - matching_matrix[prior_match_gt_mask] *= 0 - matching_matrix[prior_match_gt_mask, cost_argmin, ] = 1 - - assert not (matching_matrix.sum(0) == 0).any() - # get foreground mask inside box and center prior - fg_mask_inboxes = matching_matrix.sum(1) > 0 - matched_gt_inds = matching_matrix[fg_mask_inboxes, :].argmax(1) - - return fg_mask_inboxes, matched_gt_inds diff --git a/projects/DiffusionDet/model_converters/diffusiondet_resnet_to_mmdet.py b/projects/DiffusionDet/model_converters/diffusiondet_resnet_to_mmdet.py deleted file mode 100644 index 101abd831..000000000 --- a/projects/DiffusionDet/model_converters/diffusiondet_resnet_to_mmdet.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import argparse -from collections import OrderedDict - -import numpy as np -import torch -from mmengine.fileio import load - - -def convert(src, dst): - if src.endswith('pth'): - src_model = torch.load(src) - else: - src_model = load(src) - - dst_state_dict = OrderedDict() - for k, v in src_model['model'].items(): - key_name_split = k.split('.') - if 'backbone.fpn_lateral' in k: - lateral_id = int(key_name_split[-2][-1]) - name = f'neck.lateral_convs.{lateral_id - 2}.' \ - f'conv.{key_name_split[-1]}' - elif 'backbone.fpn_output' in k: - lateral_id = int(key_name_split[-2][-1]) - name = f'neck.fpn_convs.{lateral_id - 2}.conv.' 
\ - f'{key_name_split[-1]}' - elif 'backbone.bottom_up.stem.conv1.norm.' in k: - name = f'backbone.bn1.{key_name_split[-1]}' - elif 'backbone.bottom_up.stem.conv1.' in k: - name = f'backbone.conv1.{key_name_split[-1]}' - elif 'backbone.bottom_up.res' in k: - # weight_type = key_name_split[-1] - res_id = int(key_name_split[2][-1]) - 1 - # deal with short cut - if 'shortcut' in key_name_split[4]: - if 'shortcut' == key_name_split[-2]: - name = f'backbone.layer{res_id}.' \ - f'{key_name_split[3]}.downsample.0.' \ - f'{key_name_split[-1]}' - elif 'shortcut' == key_name_split[-3]: - name = f'backbone.layer{res_id}.' \ - f'{key_name_split[3]}.downsample.1.' \ - f'{key_name_split[-1]}' - else: - print(f'Unvalid key {k}') - # deal with conv - elif 'conv' in key_name_split[-2]: - conv_id = int(key_name_split[-2][-1]) - name = f'backbone.layer{res_id}.{key_name_split[3]}' \ - f'.conv{conv_id}.{key_name_split[-1]}' - # deal with BN - elif key_name_split[-2] == 'norm': - conv_id = int(key_name_split[-3][-1]) - name = f'backbone.layer{res_id}.{key_name_split[3]}.' \ - f'bn{conv_id}.{key_name_split[-1]}' - else: - print(f'{k} is invalid') - - elif key_name_split[0] == 'head': - # d2: head.xxx -> mmdet: bbox_head.xxx - name = f'bbox_{k}' - else: - # some base parameters such as beta will not convert - print(f'{k} is not converted!!') - continue - - if not isinstance(v, np.ndarray) and not isinstance(v, torch.Tensor): - raise ValueError( - 'Unsupported type found in checkpoint! {}: {}'.format( - k, type(v))) - if not isinstance(v, torch.Tensor): - dst_state_dict[name] = torch.from_numpy(v) - else: - dst_state_dict[name] = v - mmdet_model = dict(state_dict=dst_state_dict, meta=dict()) - torch.save(mmdet_model, dst) - - -def main(): - parser = argparse.ArgumentParser(description='Convert model keys') - parser.add_argument('src', help='src detectron model path') - parser.add_argument('dst', help='save path') - args = parser.parse_args() - convert(args.src, args.dst) - - -if __name__ == '__main__': - main() diff --git a/projects/EfficientDet/README.md b/projects/EfficientDet/README.md deleted file mode 100644 index 36f4ed403..000000000 --- a/projects/EfficientDet/README.md +++ /dev/null @@ -1,154 +0,0 @@ -# EfficientDet - -> [**EfficientDet: Scalable and Efficient Object Detection**](https://arxiv.org/pdf/1911.09070.pdf), -> Mingxing Tan, Ruoming Pang, Quoc V. Le, -> *CVPR 2020* - -## Abstract - -This is an implementation of [EfficientDet](https://github.com/google/automl) based on [MMDetection](https://github.com/open-mmlab/mmdetection/tree/main), [MMCV](https://github.com/open-mmlab/mmcv), and [MMEngine](https://github.com/open-mmlab/mmengine). -
-EfficientDet is a new family of object detectors that consistently achieve much better efficiency than prior art across a wide -spectrum of resource constraints. -In particular, with a single model and single scale, EfficientDet-D7 achieves state-of-the-art 55.1 AP on COCO test-dev with 77M parameters and 410B FLOPs. -
-BiFPN is a simple yet highly effective weighted bi-directional feature pyramid network, which introduces learnable weights to learn the importance of different input features, while repeatedly applying top-down and bottom-up multi-scale feature fusion. -
-In contrast to other feature pyramid networks, such as FPN, FPN + PAN, and NAS-FPN, BiFPN achieves the best accuracy with fewer parameters and FLOPs. - -
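As a rough illustration of the weighted fusion idea described above, the sketch below shows BiFPN-style fast normalized fusion in a few lines of PyTorch. It is only a minimal, self-contained example: the class name, channel count, and tensor shapes are invented for illustration and do not correspond to the `BiFPN` module built by the configs in this project.

```python
import torch
import torch.nn as nn


class FastNormalizedFusion(nn.Module):
    """Fuse feature maps with learnable, normalized, non-negative weights."""

    def __init__(self, num_inputs: int = 2, eps: float = 1e-4):
        super().__init__()
        # One learnable scalar weight per input feature map.
        self.weights = nn.Parameter(torch.ones(num_inputs))
        self.eps = eps

    def forward(self, *feats: torch.Tensor) -> torch.Tensor:
        # ReLU keeps the weights non-negative; normalizing them keeps the
        # fused output on the same scale as the inputs.
        w = torch.relu(self.weights)
        w = w / (w.sum() + self.eps)
        return sum(wi * fi for wi, fi in zip(w, feats))


# Example: fuse a same-level input feature with an upsampled top-down feature.
fuse = FastNormalizedFusion(num_inputs=2)
p4_in = torch.randn(1, 64, 32, 32)
p5_td = torch.randn(1, 64, 32, 32)
p4_out = fuse(p4_in, p5_td)  # shape: (1, 64, 32, 32)
```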
- -
- -## Usage - -## Official TensorFlow Model - -This project also supports [official tensorflow model](https://github.com/google/automl), it uses 90 categories and yxyx box encoding in training. If you want to use the original model weight to get official results, please refer to the following steps. - -### Model conversion - -Firstly, download EfficientDet [weights](https://github.com/google/automl/tree/master/efficientdet) and unzip, please use the following command - -```bash -tar -xzvf {EFFICIENTDET_WEIGHT} -``` - -Then, install tensorflow, please use the following command - -```bash -pip install tensorflow-gpu==2.6.0 -``` - -Lastly, convert weights from tensorflow to pytorch, please use the following command - -```bash -python projects/EfficientDet/convert_tf_to_pt.py --backbone {BACKBONE_NAME} --tensorflow_weight {TENSORFLOW_WEIGHT_PATH} --out_weight {OUT_PATH} -``` - -### Testing commands - -In MMDetection's root directory, run the following command to test the model: - -```bash -python tools/test.py projects/EfficientDet/configs/tensorflow/efficientdet_effb0_bifpn_8xb16-crop512-300e_coco_tf.py ${CHECKPOINT_PATH} -``` - -## Reproduce Model - -For convenience, we recommend the current implementation version, it uses 80 categories and xyxy encoding in training. On this basis, a higher result was finally achieved. - -### Training commands - -In MMDetection's root directory, run the following command to train the model: - -```bash -python tools/train.py projects/EfficientDet/configs/efficientdet_effb3_bifpn_8xb16-crop896-300e_coco.py -``` - -### Testing commands - -In MMDetection's root directory, run the following command to test the model: - -```bash -python tools/test.py projects/EfficientDet/configs/efficientdet_effb3_bifpn_8xb16-crop896-300e_coco.py ${CHECKPOINT_PATH} -``` - -## Results - -Based on mmdetection, this project aligns the accuracy of the [official model](https://github.com/google/automl). 
- -| Method | Backbone | Pretrained Model | Training set | Test set | Epoch | Val Box AP | Official AP | Download | -| :------------------------------------------------------------------------------------------------------------------: | :-------------: | :--------------: | :------------: | :----------: | :---: | :--------: | :---------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| [efficientdet-d0\*](projects/EfficientDet/configs/tensorflow/efficientdet_effb0_bifpn_8xb16-crop512-300e_coco_tf.py) | efficientnet-b0 | ImageNet | COCO2017 Train | COCO2017 Val | 300 | 34.4 | 34.3 | | -| [efficientdet-d3](projects/EfficientDet/configs/efficientdet_effb3_bifpn_8xb16-crop896-300e_coco.py) | efficientnet-b3 | ImageNet | COCO2017 Train | COCO2017 Val | 300 | 47.2 | 46.8 | [model](https://download.openmmlab.com/mmdetection/v3.0/efficientdet/efficientdet_effb3_bifpn_8xb16-crop896-300e_coco/efficientdet_effb3_bifpn_8xb16-crop896-300e_coco_20230223_122457-e6f7a833.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/efficientdet/efficientdet_effb3_bifpn_8xb16-crop896-300e_coco/efficientdet_effb3_bifpn_8xb16-crop896-300e_coco_20230223_122457.log.json) | - -**Note**: -\*means use [official tensorflow model](https://github.com/google/automl) weights to test. - -## Citation - -```BibTeX -@inproceedings{tan2020efficientdet, - title={Efficientdet: Scalable and efficient object detection}, - author={Tan, Mingxing and Pang, Ruoming and Le, Quoc V}, - booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition}, - pages={10781--10790}, - year={2020} -} -``` - -## Checklist - - - -- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. - - - [x] Finish the code - - - - - [x] Basic docstrings & proper citation - - - - - [x] Test-time correctness - - - - - [x] A full README - - - -- [x] Milestone 2: Indicates a successful model implementation. - - - [x] Training-time correctness - - - -- [ ] Milestone 3: Good to be a part of our core package! - - - [ ] Type hints and docstrings - - - - - [ ] Unit tests - - - - - [ ] Code polishing - - - - - [ ] Metafile.yml - - - -- [ ] Move your modules into the core package following the codebase's file hierarchy structure. - - - -- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure. 
diff --git a/projects/EfficientDet/configs/efficientdet_effb0_bifpn_8xb16-crop512-300e_coco.py b/projects/EfficientDet/configs/efficientdet_effb0_bifpn_8xb16-crop512-300e_coco.py deleted file mode 100644 index c7a3b3092..000000000 --- a/projects/EfficientDet/configs/efficientdet_effb0_bifpn_8xb16-crop512-300e_coco.py +++ /dev/null @@ -1,171 +0,0 @@ -_base_ = [ - 'mmdet::_base_/datasets/coco_detection.py', - 'mmdet::_base_/schedules/schedule_1x.py', - 'mmdet::_base_/default_runtime.py' -] -custom_imports = dict( - imports=['projects.EfficientDet.efficientdet'], allow_failed_imports=False) - -image_size = 512 -batch_augments = [ - dict(type='BatchFixedSizePad', size=(image_size, image_size)) -] -dataset_type = 'CocoDataset' -evalute_type = 'CocoMetric' -norm_cfg = dict(type='SyncBN', requires_grad=True, eps=1e-3, momentum=0.01) -checkpoint = 'https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b0_3rdparty_8xb32-aa-advprop_in1k_20220119-26434485.pth' # noqa -model = dict( - type='EfficientDet', - data_preprocessor=dict( - type='DetDataPreprocessor', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - bgr_to_rgb=True, - pad_size_divisor=image_size, - batch_augments=batch_augments), - backbone=dict( - type='EfficientNet', - arch='b0', - drop_path_rate=0.2, - out_indices=(3, 4, 5), - frozen_stages=0, - conv_cfg=dict(type='Conv2dSamePadding'), - norm_cfg=norm_cfg, - norm_eval=False, - init_cfg=dict( - type='Pretrained', prefix='backbone', checkpoint=checkpoint)), - neck=dict( - type='BiFPN', - num_stages=3, - in_channels=[40, 112, 320], - out_channels=64, - start_level=0, - norm_cfg=norm_cfg), - bbox_head=dict( - type='EfficientDetSepBNHead', - num_classes=80, - num_ins=5, - in_channels=64, - feat_channels=64, - stacked_convs=3, - norm_cfg=norm_cfg, - anchor_generator=dict( - type='AnchorGenerator', - octave_base_scale=4, - scales_per_octave=3, - ratios=[1.0, 0.5, 2.0], - strides=[8, 16, 32, 64, 128], - center_offset=0.5), - bbox_coder=dict( - type='DeltaXYWHBBoxCoder', - target_means=[.0, .0, .0, .0], - target_stds=[1.0, 1.0, 1.0, 1.0]), - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - gamma=1.5, - alpha=0.25, - loss_weight=1.0), - loss_bbox=dict(type='HuberLoss', beta=0.1, loss_weight=50)), - # training and testing settings - train_cfg=dict( - assigner=dict( - type='MaxIoUAssigner', - pos_iou_thr=0.5, - neg_iou_thr=0.5, - min_pos_iou=0, - ignore_iof_thr=-1), - sampler=dict( - type='PseudoSampler'), # Focal loss should use PseudoSampler - allowed_border=-1, - pos_weight=-1, - debug=False), - test_cfg=dict( - nms_pre=1000, - min_bbox_size=0, - score_thr=0.05, - nms=dict( - type='soft_nms', - iou_threshold=0.3, - sigma=0.5, - min_score=1e-3, - method='gaussian'), - max_per_img=100)) - -# dataset settings -train_pipeline = [ - dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), - dict(type='LoadAnnotations', with_bbox=True), - dict( - type='RandomResize', - scale=(image_size, image_size), - ratio_range=(0.1, 2.0), - keep_ratio=True), - dict(type='RandomCrop', crop_size=(image_size, image_size)), - dict(type='RandomFlip', prob=0.5), - dict(type='PackDetInputs') -] -test_pipeline = [ - dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), - dict(type='Resize', scale=(image_size, image_size), keep_ratio=True), - dict(type='LoadAnnotations', with_bbox=True), - dict( - type='PackDetInputs', - meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', - 'scale_factor')) -] - -train_dataloader = dict( - batch_size=16, - 
num_workers=8, - dataset=dict(type=dataset_type, pipeline=train_pipeline)) -val_dataloader = dict(dataset=dict(type=dataset_type, pipeline=test_pipeline)) -test_dataloader = val_dataloader - -val_evaluator = dict(type=evalute_type) -test_evaluator = val_evaluator - -optim_wrapper = dict( - optimizer=dict(lr=0.16, weight_decay=4e-5), - paramwise_cfg=dict( - norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True), - clip_grad=dict(max_norm=10, norm_type=2)) - -# learning policy -max_epochs = 300 -param_scheduler = [ - dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=917), - dict( - type='CosineAnnealingLR', - eta_min=0.0, - begin=1, - T_max=299, - end=300, - by_epoch=True, - convert_to_iter_based=True) -] -train_cfg = dict(max_epochs=max_epochs, val_interval=1) - -vis_backends = [ - dict(type='LocalVisBackend'), - dict(type='TensorboardVisBackend') -] -visualizer = dict( - type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer') - -default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=15)) -custom_hooks = [ - dict( - type='EMAHook', - ema_type='ExpMomentumEMA', - momentum=0.0002, - update_buffers=True, - priority=49) -] -# cudnn_benchmark=True can accelerate fix-size training -env_cfg = dict(cudnn_benchmark=True) - -# NOTE: `auto_scale_lr` is for automatically scaling LR, -# USER SHOULD NOT CHANGE ITS VALUES. -# base_batch_size = (8 GPUs) x (16 samples per GPU) -auto_scale_lr = dict(base_batch_size=128) diff --git a/projects/EfficientDet/configs/efficientdet_effb3_bifpn_8xb16-crop896-300e_coco-90cls.py b/projects/EfficientDet/configs/efficientdet_effb3_bifpn_8xb16-crop896-300e_coco-90cls.py deleted file mode 100644 index fe82a5e1b..000000000 --- a/projects/EfficientDet/configs/efficientdet_effb3_bifpn_8xb16-crop896-300e_coco-90cls.py +++ /dev/null @@ -1,171 +0,0 @@ -_base_ = [ - 'mmdet::_base_/datasets/coco_detection.py', - 'mmdet::_base_/schedules/schedule_1x.py', - 'mmdet::_base_/default_runtime.py' -] -custom_imports = dict( - imports=['projects.EfficientDet.efficientdet'], allow_failed_imports=False) - -image_size = 896 -batch_augments = [ - dict(type='BatchFixedSizePad', size=(image_size, image_size)) -] -dataset_type = 'Coco90Dataset' -evalute_type = 'Coco90Metric' -norm_cfg = dict(type='SyncBN', requires_grad=True, eps=1e-3, momentum=0.01) -checkpoint = 'https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b3_3rdparty_8xb32-aa-advprop_in1k_20220119-53b41118.pth' # noqa -model = dict( - type='EfficientDet', - data_preprocessor=dict( - type='DetDataPreprocessor', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - bgr_to_rgb=True, - pad_size_divisor=image_size, - batch_augments=batch_augments), - backbone=dict( - type='EfficientNet', - arch='b3', - drop_path_rate=0.3, - out_indices=(3, 4, 5), - frozen_stages=0, - conv_cfg=dict(type='Conv2dSamePadding'), - norm_cfg=norm_cfg, - norm_eval=False, - init_cfg=dict( - type='Pretrained', prefix='backbone', checkpoint=checkpoint)), - neck=dict( - type='BiFPN', - num_stages=6, - in_channels=[48, 136, 384], - out_channels=160, - start_level=0, - norm_cfg=norm_cfg), - bbox_head=dict( - type='EfficientDetSepBNHead', - num_classes=90, - num_ins=5, - in_channels=160, - feat_channels=160, - stacked_convs=4, - norm_cfg=norm_cfg, - anchor_generator=dict( - type='AnchorGenerator', - octave_base_scale=4, - scales_per_octave=3, - ratios=[1.0, 0.5, 2.0], - strides=[8, 16, 32, 64, 128], - center_offset=0.5), - bbox_coder=dict( - type='DeltaXYWHBBoxCoder', - 
target_means=[.0, .0, .0, .0], - target_stds=[1.0, 1.0, 1.0, 1.0]), - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - gamma=1.5, - alpha=0.25, - loss_weight=1.0), - loss_bbox=dict(type='HuberLoss', beta=0.1, loss_weight=50)), - # training and testing settings - train_cfg=dict( - assigner=dict( - type='MaxIoUAssigner', - pos_iou_thr=0.5, - neg_iou_thr=0.5, - min_pos_iou=0, - ignore_iof_thr=-1), - sampler=dict( - type='PseudoSampler'), # Focal loss should use PseudoSampler - allowed_border=-1, - pos_weight=-1, - debug=False), - test_cfg=dict( - nms_pre=1000, - min_bbox_size=0, - score_thr=0.05, - nms=dict( - type='soft_nms', - iou_threshold=0.3, - sigma=0.5, - min_score=1e-3, - method='gaussian'), - max_per_img=100)) - -# dataset settings -train_pipeline = [ - dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), - dict(type='LoadAnnotations', with_bbox=True), - dict( - type='RandomResize', - scale=(image_size, image_size), - ratio_range=(0.1, 2.0), - keep_ratio=True), - dict(type='RandomCrop', crop_size=(image_size, image_size)), - dict(type='RandomFlip', prob=0.5), - dict(type='PackDetInputs') -] -test_pipeline = [ - dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), - dict(type='Resize', scale=(image_size, image_size), keep_ratio=True), - dict(type='LoadAnnotations', with_bbox=True), - dict( - type='PackDetInputs', - meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', - 'scale_factor')) -] - -train_dataloader = dict( - batch_size=16, - num_workers=8, - dataset=dict(type=dataset_type, pipeline=train_pipeline)) -val_dataloader = dict(dataset=dict(type=dataset_type, pipeline=test_pipeline)) -test_dataloader = val_dataloader - -val_evaluator = dict(type=evalute_type) -test_evaluator = val_evaluator - -optim_wrapper = dict( - optimizer=dict(lr=0.16, weight_decay=4e-5), - paramwise_cfg=dict( - norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True), - clip_grad=dict(max_norm=10, norm_type=2)) - -# learning policy -max_epochs = 300 -param_scheduler = [ - dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=917), - dict( - type='CosineAnnealingLR', - eta_min=0.0, - begin=1, - T_max=299, - end=300, - by_epoch=True, - convert_to_iter_based=True) -] -train_cfg = dict(max_epochs=max_epochs, val_interval=1) - -vis_backends = [ - dict(type='LocalVisBackend'), - dict(type='TensorboardVisBackend') -] -visualizer = dict( - type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer') - -default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=15)) -custom_hooks = [ - dict( - type='EMAHook', - ema_type='ExpMomentumEMA', - momentum=0.0002, - update_buffers=True, - priority=49) -] -# cudnn_benchmark=True can accelerate fix-size training -env_cfg = dict(cudnn_benchmark=True) - -# NOTE: `auto_scale_lr` is for automatically scaling LR, -# USER SHOULD NOT CHANGE ITS VALUES. 
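A quick note on the `auto_scale_lr` blocks in these configs: MMEngine's automatic LR scaling applies the linear scaling rule, multiplying the optimizer learning rate by the ratio of the actual total batch size to `base_batch_size`. A small illustrative calculation follows; the 4-GPU setup is assumed for the example and is not part of the configs.

```python
# Linear LR scaling as applied when automatic LR scaling is enabled (illustrative).
base_lr = 0.16              # optimizer lr used in the configs above
base_batch_size = 128       # 8 GPUs x 16 samples per GPU
actual_batch_size = 4 * 16  # hypothetical run on 4 GPUs with 16 samples each

scaled_lr = base_lr * actual_batch_size / base_batch_size
print(scaled_lr)  # 0.08
```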
-# base_batch_size = (8 GPUs) x (16 samples per GPU) -auto_scale_lr = dict(base_batch_size=128) diff --git a/projects/EfficientDet/configs/efficientdet_effb3_bifpn_8xb16-crop896-300e_coco.py b/projects/EfficientDet/configs/efficientdet_effb3_bifpn_8xb16-crop896-300e_coco.py deleted file mode 100644 index 2079e2ac6..000000000 --- a/projects/EfficientDet/configs/efficientdet_effb3_bifpn_8xb16-crop896-300e_coco.py +++ /dev/null @@ -1,171 +0,0 @@ -_base_ = [ - 'mmdet::_base_/datasets/coco_detection.py', - 'mmdet::_base_/schedules/schedule_1x.py', - 'mmdet::_base_/default_runtime.py' -] -custom_imports = dict( - imports=['projects.EfficientDet.efficientdet'], allow_failed_imports=False) - -image_size = 896 -batch_augments = [ - dict(type='BatchFixedSizePad', size=(image_size, image_size)) -] -dataset_type = 'CocoDataset' -evalute_type = 'CocoMetric' -norm_cfg = dict(type='SyncBN', requires_grad=True, eps=1e-3, momentum=0.01) -checkpoint = 'https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b3_3rdparty_8xb32-aa-advprop_in1k_20220119-53b41118.pth' # noqa -model = dict( - type='EfficientDet', - data_preprocessor=dict( - type='DetDataPreprocessor', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - bgr_to_rgb=True, - pad_size_divisor=image_size, - batch_augments=batch_augments), - backbone=dict( - type='EfficientNet', - arch='b3', - drop_path_rate=0.3, - out_indices=(3, 4, 5), - frozen_stages=0, - conv_cfg=dict(type='Conv2dSamePadding'), - norm_cfg=norm_cfg, - norm_eval=False, - init_cfg=dict( - type='Pretrained', prefix='backbone', checkpoint=checkpoint)), - neck=dict( - type='BiFPN', - num_stages=6, - in_channels=[48, 136, 384], - out_channels=160, - start_level=0, - norm_cfg=norm_cfg), - bbox_head=dict( - type='EfficientDetSepBNHead', - num_classes=80, - num_ins=5, - in_channels=160, - feat_channels=160, - stacked_convs=4, - norm_cfg=norm_cfg, - anchor_generator=dict( - type='AnchorGenerator', - octave_base_scale=4, - scales_per_octave=3, - ratios=[1.0, 0.5, 2.0], - strides=[8, 16, 32, 64, 128], - center_offset=0.5), - bbox_coder=dict( - type='DeltaXYWHBBoxCoder', - target_means=[.0, .0, .0, .0], - target_stds=[1.0, 1.0, 1.0, 1.0]), - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - gamma=1.5, - alpha=0.25, - loss_weight=1.0), - loss_bbox=dict(type='HuberLoss', beta=0.1, loss_weight=50)), - # training and testing settings - train_cfg=dict( - assigner=dict( - type='MaxIoUAssigner', - pos_iou_thr=0.5, - neg_iou_thr=0.5, - min_pos_iou=0, - ignore_iof_thr=-1), - sampler=dict( - type='PseudoSampler'), # Focal loss should use PseudoSampler - allowed_border=-1, - pos_weight=-1, - debug=False), - test_cfg=dict( - nms_pre=1000, - min_bbox_size=0, - score_thr=0.05, - nms=dict( - type='soft_nms', - iou_threshold=0.3, - sigma=0.5, - min_score=1e-3, - method='gaussian'), - max_per_img=100)) - -# dataset settings -train_pipeline = [ - dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), - dict(type='LoadAnnotations', with_bbox=True), - dict( - type='RandomResize', - scale=(image_size, image_size), - ratio_range=(0.1, 2.0), - keep_ratio=True), - dict(type='RandomCrop', crop_size=(image_size, image_size)), - dict(type='RandomFlip', prob=0.5), - dict(type='PackDetInputs') -] -test_pipeline = [ - dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), - dict(type='Resize', scale=(image_size, image_size), keep_ratio=True), - dict(type='LoadAnnotations', with_bbox=True), - dict( - type='PackDetInputs', - meta_keys=('img_id', 
'img_path', 'ori_shape', 'img_shape', - 'scale_factor')) -] - -train_dataloader = dict( - batch_size=16, - num_workers=8, - dataset=dict(type=dataset_type, pipeline=train_pipeline)) -val_dataloader = dict(dataset=dict(type=dataset_type, pipeline=test_pipeline)) -test_dataloader = val_dataloader - -val_evaluator = dict(type=evalute_type) -test_evaluator = val_evaluator - -optim_wrapper = dict( - optimizer=dict(lr=0.16, weight_decay=4e-5), - paramwise_cfg=dict( - norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True), - clip_grad=dict(max_norm=10, norm_type=2)) - -# learning policy -max_epochs = 300 -param_scheduler = [ - dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=917), - dict( - type='CosineAnnealingLR', - eta_min=0.0, - begin=1, - T_max=299, - end=300, - by_epoch=True, - convert_to_iter_based=True) -] -train_cfg = dict(max_epochs=max_epochs, val_interval=1) - -vis_backends = [ - dict(type='LocalVisBackend'), - dict(type='TensorboardVisBackend') -] -visualizer = dict( - type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer') - -default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=15)) -custom_hooks = [ - dict( - type='EMAHook', - ema_type='ExpMomentumEMA', - momentum=0.0002, - update_buffers=True, - priority=49) -] -# cudnn_benchmark=True can accelerate fix-size training -env_cfg = dict(cudnn_benchmark=True) - -# NOTE: `auto_scale_lr` is for automatically scaling LR, -# USER SHOULD NOT CHANGE ITS VALUES. -# base_batch_size = (8 GPUs) x (16 samples per GPU) -auto_scale_lr = dict(base_batch_size=128) diff --git a/projects/EfficientDet/configs/tensorflow/efficientdet_effb0_bifpn_8xb16-crop512-300e_coco_tf.py b/projects/EfficientDet/configs/tensorflow/efficientdet_effb0_bifpn_8xb16-crop512-300e_coco_tf.py deleted file mode 100644 index bf3d3fc17..000000000 --- a/projects/EfficientDet/configs/tensorflow/efficientdet_effb0_bifpn_8xb16-crop512-300e_coco_tf.py +++ /dev/null @@ -1,171 +0,0 @@ -_base_ = [ - 'mmdet::_base_/datasets/coco_detection.py', - 'mmdet::_base_/schedules/schedule_1x.py', - 'mmdet::_base_/default_runtime.py' -] -custom_imports = dict( - imports=['projects.EfficientDet.efficientdet'], allow_failed_imports=False) - -image_size = 512 -batch_augments = [ - dict(type='BatchFixedSizePad', size=(image_size, image_size)) -] -dataset_type = 'Coco90Dataset' -evalute_type = 'Coco90Metric' -norm_cfg = dict(type='SyncBN', requires_grad=True, eps=1e-3, momentum=0.01) -checkpoint = 'https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b0_3rdparty_8xb32-aa-advprop_in1k_20220119-26434485.pth' # noqa -model = dict( - type='EfficientDet', - data_preprocessor=dict( - type='DetDataPreprocessor', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - bgr_to_rgb=True, - pad_size_divisor=image_size, - batch_augments=batch_augments), - backbone=dict( - type='EfficientNet', - arch='b0', - drop_path_rate=0.2, - out_indices=(3, 4, 5), - frozen_stages=0, - conv_cfg=dict(type='Conv2dSamePadding'), - norm_cfg=norm_cfg, - norm_eval=False, - init_cfg=dict( - type='Pretrained', prefix='backbone', checkpoint=checkpoint)), - neck=dict( - type='BiFPN', - num_stages=3, - in_channels=[40, 112, 320], - out_channels=64, - start_level=0, - norm_cfg=norm_cfg), - bbox_head=dict( - type='EfficientDetSepBNHead', - num_classes=90, - num_ins=5, - in_channels=64, - feat_channels=64, - stacked_convs=3, - norm_cfg=norm_cfg, - anchor_generator=dict( - type='YXYXAnchorGenerator', - octave_base_scale=4, - scales_per_octave=3, - 
ratios=[1.0, 0.5, 2.0], - strides=[8, 16, 32, 64, 128], - center_offset=0.5), - bbox_coder=dict( - type='YXYXDeltaXYWHBBoxCoder', - target_means=[.0, .0, .0, .0], - target_stds=[1.0, 1.0, 1.0, 1.0]), - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - gamma=1.5, - alpha=0.25, - loss_weight=1.0), - loss_bbox=dict(type='HuberLoss', beta=0.1, loss_weight=50)), - # training and testing settings - train_cfg=dict( - assigner=dict( - type='TransMaxIoUAssigner', - pos_iou_thr=0.5, - neg_iou_thr=0.5, - min_pos_iou=0, - ignore_iof_thr=-1), - sampler=dict( - type='PseudoSampler'), # Focal loss should use PseudoSampler - allowed_border=-1, - pos_weight=-1, - debug=False), - test_cfg=dict( - nms_pre=1000, - min_bbox_size=0, - score_thr=0.05, - nms=dict( - type='soft_nms', - iou_threshold=0.3, - sigma=0.5, - min_score=1e-3, - method='gaussian'), - max_per_img=100)) - -# dataset settings -train_pipeline = [ - dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), - dict(type='LoadAnnotations', with_bbox=True), - dict( - type='RandomResize', - scale=(image_size, image_size), - ratio_range=(0.1, 2.0), - keep_ratio=True), - dict(type='RandomCrop', crop_size=(image_size, image_size)), - dict(type='RandomFlip', prob=0.5), - dict(type='PackDetInputs') -] -test_pipeline = [ - dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), - dict(type='Resize', scale=(image_size, image_size), keep_ratio=True), - dict(type='LoadAnnotations', with_bbox=True), - dict( - type='PackDetInputs', - meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', - 'scale_factor')) -] - -train_dataloader = dict( - batch_size=16, - num_workers=8, - dataset=dict(type=dataset_type, pipeline=train_pipeline)) -val_dataloader = dict(dataset=dict(type=dataset_type, pipeline=test_pipeline)) -test_dataloader = val_dataloader - -val_evaluator = dict(type=evalute_type) -test_evaluator = val_evaluator - -optim_wrapper = dict( - optimizer=dict(lr=0.16, weight_decay=4e-5), - paramwise_cfg=dict( - norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True), - clip_grad=dict(max_norm=10, norm_type=2)) - -# learning policy -max_epochs = 300 -param_scheduler = [ - dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=917), - dict( - type='CosineAnnealingLR', - eta_min=0.0, - begin=1, - T_max=299, - end=300, - by_epoch=True, - convert_to_iter_based=True) -] -train_cfg = dict(max_epochs=max_epochs, val_interval=1) - -vis_backends = [ - dict(type='LocalVisBackend'), - dict(type='TensorboardVisBackend') -] -visualizer = dict( - type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer') - -default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=15)) -custom_hooks = [ - dict( - type='EMAHook', - ema_type='ExpMomentumEMA', - momentum=0.0002, - update_buffers=True, - priority=49) -] -# cudnn_benchmark=True can accelerate fix-size training -env_cfg = dict(cudnn_benchmark=True) - -# NOTE: `auto_scale_lr` is for automatically scaling LR, -# USER SHOULD NOT CHANGE ITS VALUES. 
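For readers who want to see how these configs were consumed, the standard MMEngine entry point is sketched below. This is a minimal sketch, not part of the deleted code: the config path and `work_dir` are placeholders, and it assumes an MMDetection v3.x environment that still contains `projects/EfficientDet`.

```python
from mmengine.config import Config
from mmengine.runner import Runner

# Build and train EfficientDet-D0 from the config above (paths are placeholders).
cfg = Config.fromfile(
    'projects/EfficientDet/configs/'
    'efficientdet_effb0_bifpn_8xb16-crop512-300e_coco.py')
cfg.work_dir = './work_dirs/efficientdet_effb0'

runner = Runner.from_cfg(cfg)
runner.train()
```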
-# base_batch_size = (8 GPUs) x (16 samples per GPU) -auto_scale_lr = dict(base_batch_size=128) diff --git a/projects/EfficientDet/convert_tf_to_pt.py b/projects/EfficientDet/convert_tf_to_pt.py deleted file mode 100644 index f3b127f2a..000000000 --- a/projects/EfficientDet/convert_tf_to_pt.py +++ /dev/null @@ -1,626 +0,0 @@ -import argparse - -import numpy as np -import torch -from tensorflow.python.training import py_checkpoint_reader - -torch.set_printoptions(precision=20) - - -def tf2pth(v): - if v.ndim == 4: - return np.ascontiguousarray(v.transpose(3, 2, 0, 1)) - elif v.ndim == 2: - return np.ascontiguousarray(v.transpose()) - return v - - -def convert_key(model_name, bifpn_repeats, weights): - - p6_w1 = [ - torch.tensor([-1e4, -1e4], dtype=torch.float64) - for _ in range(bifpn_repeats) - ] - p5_w1 = [ - torch.tensor([-1e4, -1e4], dtype=torch.float64) - for _ in range(bifpn_repeats) - ] - p4_w1 = [ - torch.tensor([-1e4, -1e4], dtype=torch.float64) - for _ in range(bifpn_repeats) - ] - p3_w1 = [ - torch.tensor([-1e4, -1e4], dtype=torch.float64) - for _ in range(bifpn_repeats) - ] - p4_w2 = [ - torch.tensor([-1e4, -1e4, -1e4], dtype=torch.float64) - for _ in range(bifpn_repeats) - ] - p5_w2 = [ - torch.tensor([-1e4, -1e4, -1e4], dtype=torch.float64) - for _ in range(bifpn_repeats) - ] - p6_w2 = [ - torch.tensor([-1e4, -1e4, -1e4], dtype=torch.float64) - for _ in range(bifpn_repeats) - ] - p7_w2 = [ - torch.tensor([-1e4, -1e4], dtype=torch.float64) - for _ in range(bifpn_repeats) - ] - idx2key = { - 0: '1.0', - 1: '2.0', - 2: '2.1', - 3: '3.0', - 4: '3.1', - 5: '4.0', - 6: '4.1', - 7: '4.2', - 8: '4.3', - 9: '4.4', - 10: '4.5', - 11: '5.0', - 12: '5.1', - 13: '5.2', - 14: '5.3', - 15: '5.4' - } - m = dict() - for k, v in weights.items(): - - if 'Exponential' in k or 'global_step' in k: - continue - - seg = k.split('/') - if len(seg) == 1: - continue - if seg[2] == 'depthwise_conv2d': - v = v.transpose(1, 0) - - if seg[0] == model_name: - if seg[1] == 'stem': - prefix = 'backbone.layers.0' - mapping = { - 'conv2d/kernel': 'conv.weight', - 'tpu_batch_normalization/beta': 'bn.bias', - 'tpu_batch_normalization/gamma': 'bn.weight', - 'tpu_batch_normalization/moving_mean': 'bn.running_mean', - 'tpu_batch_normalization/moving_variance': - 'bn.running_var', - } - suffix = mapping['/'.join(seg[2:])] - m[prefix + '.' 
+ suffix] = v - - elif seg[1].startswith('blocks_'): - idx = int(seg[1][7:]) - prefix = '.'.join(['backbone', 'layers', idx2key[idx]]) - base_mapping = { - 'depthwise_conv2d/depthwise_kernel': - 'depthwise_conv.conv.weight', - 'se/conv2d/kernel': 'se.conv1.conv.weight', - 'se/conv2d/bias': 'se.conv1.conv.bias', - 'se/conv2d_1/kernel': 'se.conv2.conv.weight', - 'se/conv2d_1/bias': 'se.conv2.conv.bias' - } - if idx == 0: - mapping = { - 'conv2d/kernel': - 'linear_conv.conv.weight', - 'tpu_batch_normalization/beta': - 'depthwise_conv.bn.bias', - 'tpu_batch_normalization/gamma': - 'depthwise_conv.bn.weight', - 'tpu_batch_normalization/moving_mean': - 'depthwise_conv.bn.running_mean', - 'tpu_batch_normalization/moving_variance': - 'depthwise_conv.bn.running_var', - 'tpu_batch_normalization_1/beta': - 'linear_conv.bn.bias', - 'tpu_batch_normalization_1/gamma': - 'linear_conv.bn.weight', - 'tpu_batch_normalization_1/moving_mean': - 'linear_conv.bn.running_mean', - 'tpu_batch_normalization_1/moving_variance': - 'linear_conv.bn.running_var', - } - else: - mapping = { - 'depthwise_conv2d/depthwise_kernel': - 'depthwise_conv.conv.weight', - 'conv2d/kernel': - 'expand_conv.conv.weight', - 'conv2d_1/kernel': - 'linear_conv.conv.weight', - 'tpu_batch_normalization/beta': - 'expand_conv.bn.bias', - 'tpu_batch_normalization/gamma': - 'expand_conv.bn.weight', - 'tpu_batch_normalization/moving_mean': - 'expand_conv.bn.running_mean', - 'tpu_batch_normalization/moving_variance': - 'expand_conv.bn.running_var', - 'tpu_batch_normalization_1/beta': - 'depthwise_conv.bn.bias', - 'tpu_batch_normalization_1/gamma': - 'depthwise_conv.bn.weight', - 'tpu_batch_normalization_1/moving_mean': - 'depthwise_conv.bn.running_mean', - 'tpu_batch_normalization_1/moving_variance': - 'depthwise_conv.bn.running_var', - 'tpu_batch_normalization_2/beta': - 'linear_conv.bn.bias', - 'tpu_batch_normalization_2/gamma': - 'linear_conv.bn.weight', - 'tpu_batch_normalization_2/moving_mean': - 'linear_conv.bn.running_mean', - 'tpu_batch_normalization_2/moving_variance': - 'linear_conv.bn.running_var', - } - mapping.update(base_mapping) - suffix = mapping['/'.join(seg[2:])] - m[prefix + '.' + suffix] = v - elif seg[0] == 'resample_p6': - prefix = 'neck.bifpn.0.p5_to_p6.0' - mapping = { - 'conv2d/kernel': 'down_conv.weight', - 'conv2d/bias': 'down_conv.bias', - 'bn/beta': 'bn.bias', - 'bn/gamma': 'bn.weight', - 'bn/moving_mean': 'bn.running_mean', - 'bn/moving_variance': 'bn.running_var', - } - suffix = mapping['/'.join(seg[1:])] - m[prefix + '.' + suffix] = v - elif seg[0] == 'fpn_cells': - fpn_idx = int(seg[1][5:]) - prefix = '.'.join(['neck', 'bifpn', str(fpn_idx)]) - fnode_id = int(seg[2][5]) - if fnode_id == 0: - mapping = { - 'op_after_combine5/conv/depthwise_kernel': - 'conv6_up.depthwise_conv.weight', - 'op_after_combine5/conv/pointwise_kernel': - 'conv6_up.pointwise_conv.weight', - 'op_after_combine5/conv/bias': - 'conv6_up.pointwise_conv.bias', - 'op_after_combine5/bn/beta': - 'conv6_up.bn.bias', - 'op_after_combine5/bn/gamma': - 'conv6_up.bn.weight', - 'op_after_combine5/bn/moving_mean': - 'conv6_up.bn.running_mean', - 'op_after_combine5/bn/moving_variance': - 'conv6_up.bn.running_var', - } - if seg[3] != 'WSM' and seg[3] != 'WSM_1': - suffix = mapping['/'.join(seg[3:])] - if 'depthwise_conv' in suffix: - v = v.transpose(1, 0) - m[prefix + '.' 
+ suffix] = v - elif seg[3] == 'WSM': - p6_w1[fpn_idx][0] = v - elif seg[3] == 'WSM_1': - p6_w1[fpn_idx][1] = v - if torch.min(p6_w1[fpn_idx]) > -1e4: - m[prefix + '.p6_w1'] = p6_w1[fpn_idx] - elif fnode_id == 1: - base_mapping = { - 'op_after_combine6/conv/depthwise_kernel': - 'conv5_up.depthwise_conv.weight', - 'op_after_combine6/conv/pointwise_kernel': - 'conv5_up.pointwise_conv.weight', - 'op_after_combine6/conv/bias': - 'conv5_up.pointwise_conv.bias', - 'op_after_combine6/bn/beta': - 'conv5_up.bn.bias', - 'op_after_combine6/bn/gamma': - 'conv5_up.bn.weight', - 'op_after_combine6/bn/moving_mean': - 'conv5_up.bn.running_mean', - 'op_after_combine6/bn/moving_variance': - 'conv5_up.bn.running_var', - } - if fpn_idx == 0: - mapping = { - 'resample_0_2_6/conv2d/kernel': - 'p5_down_channel.down_conv.weight', - 'resample_0_2_6/conv2d/bias': - 'p5_down_channel.down_conv.bias', - 'resample_0_2_6/bn/beta': - 'p5_down_channel.bn.bias', - 'resample_0_2_6/bn/gamma': - 'p5_down_channel.bn.weight', - 'resample_0_2_6/bn/moving_mean': - 'p5_down_channel.bn.running_mean', - 'resample_0_2_6/bn/moving_variance': - 'p5_down_channel.bn.running_var', - } - base_mapping.update(mapping) - if seg[3] != 'WSM' and seg[3] != 'WSM_1': - suffix = base_mapping['/'.join(seg[3:])] - if 'depthwise_conv' in suffix: - v = v.transpose(1, 0) - m[prefix + '.' + suffix] = v - elif seg[3] == 'WSM': - p5_w1[fpn_idx][0] = v - elif seg[3] == 'WSM_1': - p5_w1[fpn_idx][1] = v - if torch.min(p5_w1[fpn_idx]) > -1e4: - m[prefix + '.p5_w1'] = p5_w1[fpn_idx] - elif fnode_id == 2: - base_mapping = { - 'op_after_combine7/conv/depthwise_kernel': - 'conv4_up.depthwise_conv.weight', - 'op_after_combine7/conv/pointwise_kernel': - 'conv4_up.pointwise_conv.weight', - 'op_after_combine7/conv/bias': - 'conv4_up.pointwise_conv.bias', - 'op_after_combine7/bn/beta': - 'conv4_up.bn.bias', - 'op_after_combine7/bn/gamma': - 'conv4_up.bn.weight', - 'op_after_combine7/bn/moving_mean': - 'conv4_up.bn.running_mean', - 'op_after_combine7/bn/moving_variance': - 'conv4_up.bn.running_var', - } - if fpn_idx == 0: - mapping = { - 'resample_0_1_7/conv2d/kernel': - 'p4_down_channel.down_conv.weight', - 'resample_0_1_7/conv2d/bias': - 'p4_down_channel.down_conv.bias', - 'resample_0_1_7/bn/beta': - 'p4_down_channel.bn.bias', - 'resample_0_1_7/bn/gamma': - 'p4_down_channel.bn.weight', - 'resample_0_1_7/bn/moving_mean': - 'p4_down_channel.bn.running_mean', - 'resample_0_1_7/bn/moving_variance': - 'p4_down_channel.bn.running_var', - } - base_mapping.update(mapping) - if seg[3] != 'WSM' and seg[3] != 'WSM_1': - suffix = base_mapping['/'.join(seg[3:])] - if 'depthwise_conv' in suffix: - v = v.transpose(1, 0) - m[prefix + '.' 
+ suffix] = v - elif seg[3] == 'WSM': - p4_w1[fpn_idx][0] = v - elif seg[3] == 'WSM_1': - p4_w1[fpn_idx][1] = v - if torch.min(p4_w1[fpn_idx]) > -1e4: - m[prefix + '.p4_w1'] = p4_w1[fpn_idx] - elif fnode_id == 3: - - base_mapping = { - 'op_after_combine8/conv/depthwise_kernel': - 'conv3_up.depthwise_conv.weight', - 'op_after_combine8/conv/pointwise_kernel': - 'conv3_up.pointwise_conv.weight', - 'op_after_combine8/conv/bias': - 'conv3_up.pointwise_conv.bias', - 'op_after_combine8/bn/beta': - 'conv3_up.bn.bias', - 'op_after_combine8/bn/gamma': - 'conv3_up.bn.weight', - 'op_after_combine8/bn/moving_mean': - 'conv3_up.bn.running_mean', - 'op_after_combine8/bn/moving_variance': - 'conv3_up.bn.running_var', - } - if fpn_idx == 0: - mapping = { - 'resample_0_0_8/conv2d/kernel': - 'p3_down_channel.down_conv.weight', - 'resample_0_0_8/conv2d/bias': - 'p3_down_channel.down_conv.bias', - 'resample_0_0_8/bn/beta': - 'p3_down_channel.bn.bias', - 'resample_0_0_8/bn/gamma': - 'p3_down_channel.bn.weight', - 'resample_0_0_8/bn/moving_mean': - 'p3_down_channel.bn.running_mean', - 'resample_0_0_8/bn/moving_variance': - 'p3_down_channel.bn.running_var', - } - base_mapping.update(mapping) - if seg[3] != 'WSM' and seg[3] != 'WSM_1': - suffix = base_mapping['/'.join(seg[3:])] - if 'depthwise_conv' in suffix: - v = v.transpose(1, 0) - m[prefix + '.' + suffix] = v - elif seg[3] == 'WSM': - p3_w1[fpn_idx][0] = v - elif seg[3] == 'WSM_1': - p3_w1[fpn_idx][1] = v - if torch.min(p3_w1[fpn_idx]) > -1e4: - m[prefix + '.p3_w1'] = p3_w1[fpn_idx] - elif fnode_id == 4: - base_mapping = { - 'op_after_combine9/conv/depthwise_kernel': - 'conv4_down.depthwise_conv.weight', - 'op_after_combine9/conv/pointwise_kernel': - 'conv4_down.pointwise_conv.weight', - 'op_after_combine9/conv/bias': - 'conv4_down.pointwise_conv.bias', - 'op_after_combine9/bn/beta': - 'conv4_down.bn.bias', - 'op_after_combine9/bn/gamma': - 'conv4_down.bn.weight', - 'op_after_combine9/bn/moving_mean': - 'conv4_down.bn.running_mean', - 'op_after_combine9/bn/moving_variance': - 'conv4_down.bn.running_var', - } - if fpn_idx == 0: - mapping = { - 'resample_0_1_9/conv2d/kernel': - 'p4_level_connection.down_conv.weight', - 'resample_0_1_9/conv2d/bias': - 'p4_level_connection.down_conv.bias', - 'resample_0_1_9/bn/beta': - 'p4_level_connection.bn.bias', - 'resample_0_1_9/bn/gamma': - 'p4_level_connection.bn.weight', - 'resample_0_1_9/bn/moving_mean': - 'p4_level_connection.bn.running_mean', - 'resample_0_1_9/bn/moving_variance': - 'p4_level_connection.bn.running_var', - } - base_mapping.update(mapping) - if seg[3] != 'WSM' and seg[3] != 'WSM_1' and seg[3] != 'WSM_2': - suffix = base_mapping['/'.join(seg[3:])] - if 'depthwise_conv' in suffix: - v = v.transpose(1, 0) - m[prefix + '.' 
+ suffix] = v - elif seg[3] == 'WSM': - p4_w2[fpn_idx][0] = v - elif seg[3] == 'WSM_1': - p4_w2[fpn_idx][1] = v - elif seg[3] == 'WSM_2': - p4_w2[fpn_idx][2] = v - if torch.min(p4_w2[fpn_idx]) > -1e4: - m[prefix + '.p4_w2'] = p4_w2[fpn_idx] - elif fnode_id == 5: - base_mapping = { - 'op_after_combine10/conv/depthwise_kernel': - 'conv5_down.depthwise_conv.weight', - 'op_after_combine10/conv/pointwise_kernel': - 'conv5_down.pointwise_conv.weight', - 'op_after_combine10/conv/bias': - 'conv5_down.pointwise_conv.bias', - 'op_after_combine10/bn/beta': - 'conv5_down.bn.bias', - 'op_after_combine10/bn/gamma': - 'conv5_down.bn.weight', - 'op_after_combine10/bn/moving_mean': - 'conv5_down.bn.running_mean', - 'op_after_combine10/bn/moving_variance': - 'conv5_down.bn.running_var', - } - if fpn_idx == 0: - mapping = { - 'resample_0_2_10/conv2d/kernel': - 'p5_level_connection.down_conv.weight', - 'resample_0_2_10/conv2d/bias': - 'p5_level_connection.down_conv.bias', - 'resample_0_2_10/bn/beta': - 'p5_level_connection.bn.bias', - 'resample_0_2_10/bn/gamma': - 'p5_level_connection.bn.weight', - 'resample_0_2_10/bn/moving_mean': - 'p5_level_connection.bn.running_mean', - 'resample_0_2_10/bn/moving_variance': - 'p5_level_connection.bn.running_var', - } - base_mapping.update(mapping) - if seg[3] != 'WSM' and seg[3] != 'WSM_1' and seg[3] != 'WSM_2': - suffix = base_mapping['/'.join(seg[3:])] - if 'depthwise_conv' in suffix: - v = v.transpose(1, 0) - m[prefix + '.' + suffix] = v - elif seg[3] == 'WSM': - p5_w2[fpn_idx][0] = v - elif seg[3] == 'WSM_1': - p5_w2[fpn_idx][1] = v - elif seg[3] == 'WSM_2': - p5_w2[fpn_idx][2] = v - if torch.min(p5_w2[fpn_idx]) > -1e4: - m[prefix + '.p5_w2'] = p5_w2[fpn_idx] - elif fnode_id == 6: - base_mapping = { - 'op_after_combine11/conv/depthwise_kernel': - 'conv6_down.depthwise_conv.weight', - 'op_after_combine11/conv/pointwise_kernel': - 'conv6_down.pointwise_conv.weight', - 'op_after_combine11/conv/bias': - 'conv6_down.pointwise_conv.bias', - 'op_after_combine11/bn/beta': - 'conv6_down.bn.bias', - 'op_after_combine11/bn/gamma': - 'conv6_down.bn.weight', - 'op_after_combine11/bn/moving_mean': - 'conv6_down.bn.running_mean', - 'op_after_combine11/bn/moving_variance': - 'conv6_down.bn.running_var', - } - if seg[3] != 'WSM' and seg[3] != 'WSM_1' and seg[3] != 'WSM_2': - suffix = base_mapping['/'.join(seg[3:])] - if 'depthwise_conv' in suffix: - v = v.transpose(1, 0) - m[prefix + '.' + suffix] = v - elif seg[3] == 'WSM': - p6_w2[fpn_idx][0] = v - elif seg[3] == 'WSM_1': - p6_w2[fpn_idx][1] = v - elif seg[3] == 'WSM_2': - p6_w2[fpn_idx][2] = v - if torch.min(p6_w2[fpn_idx]) > -1e4: - m[prefix + '.p6_w2'] = p6_w2[fpn_idx] - elif fnode_id == 7: - base_mapping = { - 'op_after_combine12/conv/depthwise_kernel': - 'conv7_down.depthwise_conv.weight', - 'op_after_combine12/conv/pointwise_kernel': - 'conv7_down.pointwise_conv.weight', - 'op_after_combine12/conv/bias': - 'conv7_down.pointwise_conv.bias', - 'op_after_combine12/bn/beta': - 'conv7_down.bn.bias', - 'op_after_combine12/bn/gamma': - 'conv7_down.bn.weight', - 'op_after_combine12/bn/moving_mean': - 'conv7_down.bn.running_mean', - 'op_after_combine12/bn/moving_variance': - 'conv7_down.bn.running_var', - } - if seg[3] != 'WSM' and seg[3] != 'WSM_1' and seg[3] != 'WSM_2': - suffix = base_mapping['/'.join(seg[3:])] - if 'depthwise_conv' in suffix: - v = v.transpose(1, 0) - m[prefix + '.' 
+ suffix] = v - elif seg[3] == 'WSM': - p7_w2[fpn_idx][0] = v - elif seg[3] == 'WSM_1': - p7_w2[fpn_idx][1] = v - if torch.min(p7_w2[fpn_idx]) > -1e4: - m[prefix + '.p7_w2'] = p7_w2[fpn_idx] - elif seg[0] == 'box_net': - if 'box-predict' in seg[1]: - prefix = '.'.join(['bbox_head', 'reg_header']) - base_mapping = { - 'depthwise_kernel': 'depthwise_conv.weight', - 'pointwise_kernel': 'pointwise_conv.weight', - 'bias': 'pointwise_conv.bias' - } - suffix = base_mapping['/'.join(seg[2:])] - if 'depthwise_conv' in suffix: - v = v.transpose(1, 0) - m[prefix + '.' + suffix] = v - elif 'bn' in seg[1]: - bbox_conv_idx = int(seg[1][4]) - bbox_bn_idx = int(seg[1][9]) - 3 - prefix = '.'.join([ - 'bbox_head', 'reg_bn_list', - str(bbox_conv_idx), - str(bbox_bn_idx) - ]) - base_mapping = { - 'beta': 'bias', - 'gamma': 'weight', - 'moving_mean': 'running_mean', - 'moving_variance': 'running_var' - } - suffix = base_mapping['/'.join(seg[2:])] - m[prefix + '.' + suffix] = v - else: - bbox_conv_idx = int(seg[1][4]) - prefix = '.'.join( - ['bbox_head', 'reg_conv_list', - str(bbox_conv_idx)]) - base_mapping = { - 'depthwise_kernel': 'depthwise_conv.weight', - 'pointwise_kernel': 'pointwise_conv.weight', - 'bias': 'pointwise_conv.bias' - } - suffix = base_mapping['/'.join(seg[2:])] - if 'depthwise_conv' in suffix: - v = v.transpose(1, 0) - m[prefix + '.' + suffix] = v - elif seg[0] == 'class_net': - if 'class-predict' in seg[1]: - prefix = '.'.join(['bbox_head', 'cls_header']) - base_mapping = { - 'depthwise_kernel': 'depthwise_conv.weight', - 'pointwise_kernel': 'pointwise_conv.weight', - 'bias': 'pointwise_conv.bias' - } - suffix = base_mapping['/'.join(seg[2:])] - if 'depthwise_conv' in suffix: - v = v.transpose(1, 0) - m[prefix + '.' + suffix] = v - elif 'bn' in seg[1]: - cls_conv_idx = int(seg[1][6]) - cls_bn_idx = int(seg[1][11]) - 3 - prefix = '.'.join([ - 'bbox_head', 'cls_bn_list', - str(cls_conv_idx), - str(cls_bn_idx) - ]) - base_mapping = { - 'beta': 'bias', - 'gamma': 'weight', - 'moving_mean': 'running_mean', - 'moving_variance': 'running_var' - } - suffix = base_mapping['/'.join(seg[2:])] - m[prefix + '.' + suffix] = v - else: - cls_conv_idx = int(seg[1][6]) - prefix = '.'.join( - ['bbox_head', 'cls_conv_list', - str(cls_conv_idx)]) - base_mapping = { - 'depthwise_kernel': 'depthwise_conv.weight', - 'pointwise_kernel': 'pointwise_conv.weight', - 'bias': 'pointwise_conv.bias' - } - suffix = base_mapping['/'.join(seg[2:])] - if 'depthwise_conv' in suffix: - v = v.transpose(1, 0) - m[prefix + '.' 
+ suffix] = v - return m - - -def parse_args(): - parser = argparse.ArgumentParser( - description='convert efficientdet weight from tensorflow to pytorch') - parser.add_argument( - '--backbone', - type=str, - help='efficientnet model name, like efficientnet-b0') - parser.add_argument( - '--tensorflow_weight', - type=str, - help='efficientdet tensorflow weight name, like efficientdet-d0/model') - parser.add_argument( - '--out_weight', - type=str, - help='efficientdet pytorch weight name like demo.pth') - args = parser.parse_args() - return args - - -def main(): - args = parse_args() - model_name = args.backbone - ori_weight_name = args.tensorflow_weight - out_name = args.out_weight - - repeat_map = { - 0: 3, - 1: 4, - 2: 5, - 3: 6, - 4: 7, - 5: 7, - 6: 8, - 7: 8, - } - - reader = py_checkpoint_reader.NewCheckpointReader(ori_weight_name) - weights = { - n: torch.as_tensor(tf2pth(reader.get_tensor(n))) - for (n, _) in reader.get_variable_to_shape_map().items() - } - bifpn_repeats = repeat_map[int(model_name[14])] - out = convert_key(model_name, bifpn_repeats, weights) - result = {'state_dict': out} - torch.save(result, out_name) - - -if __name__ == '__main__': - main() diff --git a/projects/EfficientDet/efficientdet/__init__.py b/projects/EfficientDet/efficientdet/__init__.py deleted file mode 100644 index b6c66bcc3..000000000 --- a/projects/EfficientDet/efficientdet/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from .bifpn import BiFPN -from .efficientdet import EfficientDet -from .efficientdet_head import EfficientDetSepBNHead -from .huber_loss import HuberLoss -from .tensorflow.anchor_generator import YXYXAnchorGenerator -from .tensorflow.coco_90class import Coco90Dataset -from .tensorflow.coco_90metric import Coco90Metric -from .tensorflow.trans_max_iou_assigner import TransMaxIoUAssigner -from .tensorflow.yxyx_bbox_coder import YXYXDeltaXYWHBBoxCoder -from .utils import Conv2dSamePadding - -__all__ = [ - 'EfficientDet', 'BiFPN', 'HuberLoss', 'EfficientDetSepBNHead', - 'Conv2dSamePadding', 'Coco90Dataset', 'Coco90Metric', - 'YXYXAnchorGenerator', 'TransMaxIoUAssigner', 'YXYXDeltaXYWHBBoxCoder' -] diff --git a/projects/EfficientDet/efficientdet/bifpn.py b/projects/EfficientDet/efficientdet/bifpn.py deleted file mode 100644 index 56356c3c5..000000000 --- a/projects/EfficientDet/efficientdet/bifpn.py +++ /dev/null @@ -1,306 +0,0 @@ -from typing import List - -import torch -import torch.nn as nn -from mmcv.cnn.bricks import Swish -from mmengine.model import BaseModule - -from mmdet.registry import MODELS -from mmdet.utils import MultiConfig, OptConfigType -from .utils import DepthWiseConvBlock, DownChannelBlock, MaxPool2dSamePadding - - -class BiFPNStage(nn.Module): - """ - in_channels: List[int], input dim for P3, P4, P5 - out_channels: int, output dim for P2 - P7 - first_time: int, whether is the first bifpnstage - conv_bn_act_pattern: bool, whether use conv_bn_act_pattern - norm_cfg: (:obj:`ConfigDict` or dict, optional): Config dict for - normalization layer. 
- epsilon: float, hyperparameter in fusion features - """ - - def __init__(self, - in_channels: List[int], - out_channels: int, - first_time: bool = False, - apply_bn_for_resampling: bool = True, - conv_bn_act_pattern: bool = False, - norm_cfg: OptConfigType = dict( - type='BN', momentum=1e-2, eps=1e-3), - epsilon: float = 1e-4) -> None: - super().__init__() - assert isinstance(in_channels, list) - self.in_channels = in_channels - self.out_channels = out_channels - self.first_time = first_time - self.apply_bn_for_resampling = apply_bn_for_resampling - self.conv_bn_act_pattern = conv_bn_act_pattern - self.norm_cfg = norm_cfg - self.epsilon = epsilon - - if self.first_time: - self.p5_down_channel = DownChannelBlock( - self.in_channels[-1], - self.out_channels, - apply_norm=self.apply_bn_for_resampling, - conv_bn_act_pattern=self.conv_bn_act_pattern, - norm_cfg=norm_cfg) - self.p4_down_channel = DownChannelBlock( - self.in_channels[-2], - self.out_channels, - apply_norm=self.apply_bn_for_resampling, - conv_bn_act_pattern=self.conv_bn_act_pattern, - norm_cfg=norm_cfg) - self.p3_down_channel = DownChannelBlock( - self.in_channels[-3], - self.out_channels, - apply_norm=self.apply_bn_for_resampling, - conv_bn_act_pattern=self.conv_bn_act_pattern, - norm_cfg=norm_cfg) - self.p5_to_p6 = nn.Sequential( - DownChannelBlock( - self.in_channels[-1], - self.out_channels, - apply_norm=self.apply_bn_for_resampling, - conv_bn_act_pattern=self.conv_bn_act_pattern, - norm_cfg=norm_cfg), MaxPool2dSamePadding(3, 2)) - self.p6_to_p7 = MaxPool2dSamePadding(3, 2) - self.p4_level_connection = DownChannelBlock( - self.in_channels[-2], - self.out_channels, - apply_norm=self.apply_bn_for_resampling, - conv_bn_act_pattern=self.conv_bn_act_pattern, - norm_cfg=norm_cfg) - self.p5_level_connection = DownChannelBlock( - self.in_channels[-1], - self.out_channels, - apply_norm=self.apply_bn_for_resampling, - conv_bn_act_pattern=self.conv_bn_act_pattern, - norm_cfg=norm_cfg) - - self.p6_upsample = nn.Upsample(scale_factor=2, mode='nearest') - self.p5_upsample = nn.Upsample(scale_factor=2, mode='nearest') - self.p4_upsample = nn.Upsample(scale_factor=2, mode='nearest') - self.p3_upsample = nn.Upsample(scale_factor=2, mode='nearest') - - # bottom to up: feature map down_sample module - self.p4_down_sample = MaxPool2dSamePadding(3, 2) - self.p5_down_sample = MaxPool2dSamePadding(3, 2) - self.p6_down_sample = MaxPool2dSamePadding(3, 2) - self.p7_down_sample = MaxPool2dSamePadding(3, 2) - - # Fuse Conv Layers - self.conv6_up = DepthWiseConvBlock( - out_channels, - out_channels, - apply_norm=self.apply_bn_for_resampling, - conv_bn_act_pattern=self.conv_bn_act_pattern, - norm_cfg=norm_cfg) - self.conv5_up = DepthWiseConvBlock( - out_channels, - out_channels, - apply_norm=self.apply_bn_for_resampling, - conv_bn_act_pattern=self.conv_bn_act_pattern, - norm_cfg=norm_cfg) - self.conv4_up = DepthWiseConvBlock( - out_channels, - out_channels, - apply_norm=self.apply_bn_for_resampling, - conv_bn_act_pattern=self.conv_bn_act_pattern, - norm_cfg=norm_cfg) - self.conv3_up = DepthWiseConvBlock( - out_channels, - out_channels, - apply_norm=self.apply_bn_for_resampling, - conv_bn_act_pattern=self.conv_bn_act_pattern, - norm_cfg=norm_cfg) - self.conv4_down = DepthWiseConvBlock( - out_channels, - out_channels, - apply_norm=self.apply_bn_for_resampling, - conv_bn_act_pattern=self.conv_bn_act_pattern, - norm_cfg=norm_cfg) - self.conv5_down = DepthWiseConvBlock( - out_channels, - out_channels, - apply_norm=self.apply_bn_for_resampling, - 
conv_bn_act_pattern=self.conv_bn_act_pattern, - norm_cfg=norm_cfg) - self.conv6_down = DepthWiseConvBlock( - out_channels, - out_channels, - apply_norm=self.apply_bn_for_resampling, - conv_bn_act_pattern=self.conv_bn_act_pattern, - norm_cfg=norm_cfg) - self.conv7_down = DepthWiseConvBlock( - out_channels, - out_channels, - apply_norm=self.apply_bn_for_resampling, - conv_bn_act_pattern=self.conv_bn_act_pattern, - norm_cfg=norm_cfg) - # weights - self.p6_w1 = nn.Parameter( - torch.ones(2, dtype=torch.float32), requires_grad=True) - self.p6_w1_relu = nn.ReLU() - self.p5_w1 = nn.Parameter( - torch.ones(2, dtype=torch.float32), requires_grad=True) - self.p5_w1_relu = nn.ReLU() - self.p4_w1 = nn.Parameter( - torch.ones(2, dtype=torch.float32), requires_grad=True) - self.p4_w1_relu = nn.ReLU() - self.p3_w1 = nn.Parameter( - torch.ones(2, dtype=torch.float32), requires_grad=True) - self.p3_w1_relu = nn.ReLU() - - self.p4_w2 = nn.Parameter( - torch.ones(3, dtype=torch.float32), requires_grad=True) - self.p4_w2_relu = nn.ReLU() - self.p5_w2 = nn.Parameter( - torch.ones(3, dtype=torch.float32), requires_grad=True) - self.p5_w2_relu = nn.ReLU() - self.p6_w2 = nn.Parameter( - torch.ones(3, dtype=torch.float32), requires_grad=True) - self.p6_w2_relu = nn.ReLU() - self.p7_w2 = nn.Parameter( - torch.ones(2, dtype=torch.float32), requires_grad=True) - self.p7_w2_relu = nn.ReLU() - - self.swish = Swish() - - def combine(self, x): - if not self.conv_bn_act_pattern: - x = self.swish(x) - - return x - - def forward(self, x): - if self.first_time: - p3, p4, p5 = x - # build feature map P6 - p6_in = self.p5_to_p6(p5) - # build feature map P7 - p7_in = self.p6_to_p7(p6_in) - - p3_in = self.p3_down_channel(p3) - p4_in = self.p4_down_channel(p4) - p5_in = self.p5_down_channel(p5) - - else: - p3_in, p4_in, p5_in, p6_in, p7_in = x - - # Weights for P6_0 and P7_0 to P6_1 - p6_w1 = self.p6_w1_relu(self.p6_w1) - weight = p6_w1 / (torch.sum(p6_w1, dim=0) + self.epsilon) - # Connections for P6_0 and P7_0 to P6_1 respectively - p6_up = self.conv6_up( - self.combine(weight[0] * p6_in + - weight[1] * self.p6_upsample(p7_in))) - - # Weights for P5_0 and P6_1 to P5_1 - p5_w1 = self.p5_w1_relu(self.p5_w1) - weight = p5_w1 / (torch.sum(p5_w1, dim=0) + self.epsilon) - # Connections for P5_0 and P6_1 to P5_1 respectively - p5_up = self.conv5_up( - self.combine(weight[0] * p5_in + - weight[1] * self.p5_upsample(p6_up))) - - # Weights for P4_0 and P5_1 to P4_1 - p4_w1 = self.p4_w1_relu(self.p4_w1) - weight = p4_w1 / (torch.sum(p4_w1, dim=0) + self.epsilon) - # Connections for P4_0 and P5_1 to P4_1 respectively - p4_up = self.conv4_up( - self.combine(weight[0] * p4_in + - weight[1] * self.p4_upsample(p5_up))) - - # Weights for P3_0 and P4_1 to P3_2 - p3_w1 = self.p3_w1_relu(self.p3_w1) - weight = p3_w1 / (torch.sum(p3_w1, dim=0) + self.epsilon) - # Connections for P3_0 and P4_1 to P3_2 respectively - p3_out = self.conv3_up( - self.combine(weight[0] * p3_in + - weight[1] * self.p3_upsample(p4_up))) - - if self.first_time: - p4_in = self.p4_level_connection(p4) - p5_in = self.p5_level_connection(p5) - - # Weights for P4_0, P4_1 and P3_2 to P4_2 - p4_w2 = self.p4_w2_relu(self.p4_w2) - weight = p4_w2 / (torch.sum(p4_w2, dim=0) + self.epsilon) - # Connections for P4_0, P4_1 and P3_2 to P4_2 respectively - p4_out = self.conv4_down( - self.combine(weight[0] * p4_in + weight[1] * p4_up + - weight[2] * self.p4_down_sample(p3_out))) - - # Weights for P5_0, P5_1 and P4_2 to P5_2 - p5_w2 = self.p5_w2_relu(self.p5_w2) - weight = p5_w2 / 
(torch.sum(p5_w2, dim=0) + self.epsilon) - # Connections for P5_0, P5_1 and P4_2 to P5_2 respectively - p5_out = self.conv5_down( - self.combine(weight[0] * p5_in + weight[1] * p5_up + - weight[2] * self.p5_down_sample(p4_out))) - - # Weights for P6_0, P6_1 and P5_2 to P6_2 - p6_w2 = self.p6_w2_relu(self.p6_w2) - weight = p6_w2 / (torch.sum(p6_w2, dim=0) + self.epsilon) - # Connections for P6_0, P6_1 and P5_2 to P6_2 respectively - p6_out = self.conv6_down( - self.combine(weight[0] * p6_in + weight[1] * p6_up + - weight[2] * self.p6_down_sample(p5_out))) - - # Weights for P7_0 and P6_2 to P7_2 - p7_w2 = self.p7_w2_relu(self.p7_w2) - weight = p7_w2 / (torch.sum(p7_w2, dim=0) + self.epsilon) - # Connections for P7_0 and P6_2 to P7_2 - p7_out = self.conv7_down( - self.combine(weight[0] * p7_in + - weight[1] * self.p7_down_sample(p6_out))) - return p3_out, p4_out, p5_out, p6_out, p7_out - - -@MODELS.register_module() -class BiFPN(BaseModule): - """ - num_stages: int, bifpn number of repeats - in_channels: List[int], input dim for P3, P4, P5 - out_channels: int, output dim for P2 - P7 - start_level: int, Index of input features in backbone - epsilon: float, hyperparameter in fusion features - apply_bn_for_resampling: bool, whether use bn after resampling - conv_bn_act_pattern: bool, whether use conv_bn_act_pattern - norm_cfg: (:obj:`ConfigDict` or dict, optional): Config dict for - normalization layer. - init_cfg: MultiConfig: init method - """ - - def __init__(self, - num_stages: int, - in_channels: List[int], - out_channels: int, - start_level: int = 0, - epsilon: float = 1e-4, - apply_bn_for_resampling: bool = True, - conv_bn_act_pattern: bool = False, - norm_cfg: OptConfigType = dict( - type='BN', momentum=1e-2, eps=1e-3), - init_cfg: MultiConfig = None) -> None: - super().__init__(init_cfg=init_cfg) - self.start_level = start_level - self.bifpn = nn.Sequential(*[ - BiFPNStage( - in_channels=in_channels, - out_channels=out_channels, - first_time=True if _ == 0 else False, - apply_bn_for_resampling=apply_bn_for_resampling, - conv_bn_act_pattern=conv_bn_act_pattern, - norm_cfg=norm_cfg, - epsilon=epsilon) for _ in range(num_stages) - ]) - - def forward(self, x): - x = x[self.start_level:] - x = self.bifpn(x) - - return x diff --git a/projects/EfficientDet/efficientdet/efficientdet.py b/projects/EfficientDet/efficientdet/efficientdet.py deleted file mode 100644 index 84e1778f5..000000000 --- a/projects/EfficientDet/efficientdet/efficientdet.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmdet.models.detectors.single_stage import SingleStageDetector -from mmdet.registry import MODELS -from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig - - -@MODELS.register_module() -class EfficientDet(SingleStageDetector): - - def __init__(self, - backbone: ConfigType, - neck: ConfigType, - bbox_head: ConfigType, - train_cfg: OptConfigType = None, - test_cfg: OptConfigType = None, - data_preprocessor: OptConfigType = None, - init_cfg: OptMultiConfig = None) -> None: - super().__init__( - backbone=backbone, - neck=neck, - bbox_head=bbox_head, - train_cfg=train_cfg, - test_cfg=test_cfg, - data_preprocessor=data_preprocessor, - init_cfg=init_cfg) diff --git a/projects/EfficientDet/efficientdet/efficientdet_head.py b/projects/EfficientDet/efficientdet/efficientdet_head.py deleted file mode 100644 index ae3efbe2c..000000000 --- a/projects/EfficientDet/efficientdet/efficientdet_head.py +++ /dev/null @@ -1,261 +0,0 @@ -# Copyright (c) OpenMMLab. 
All rights reserved. -from typing import List, Tuple - -import torch -import torch.nn as nn -from mmcv.cnn.bricks import Swish, build_norm_layer -from mmengine.model import bias_init_with_prob -from torch import Tensor - -from mmdet.models.dense_heads.anchor_head import AnchorHead -from mmdet.models.utils import images_to_levels, multi_apply -from mmdet.registry import MODELS -from mmdet.structures.bbox import cat_boxes, get_box_tensor -from mmdet.utils import (InstanceList, OptConfigType, OptInstanceList, - OptMultiConfig, reduce_mean) -from .utils import DepthWiseConvBlock - - -@MODELS.register_module() -class EfficientDetSepBNHead(AnchorHead): - """EfficientDetHead with separate BN. - - num_classes (int): Number of categories num_ins (int): Number of the input - feature map. in_channels (int): Number of channels in the input feature - map. feat_channels (int): Number of hidden channels. stacked_convs (int): - Number of repetitions of conv norm_cfg (dict): Config dict for - normalization layer. anchor_generator (dict): Config dict for anchor - generator bbox_coder (dict): Config of bounding box coder. loss_cls (dict): - Config of classification loss. loss_bbox (dict): Config of localization - loss. train_cfg (dict): Training config of anchor head. test_cfg (dict): - Testing config of anchor head. init_cfg (dict or list[dict], optional): - Initialization config dict. - """ - - def __init__(self, - num_classes: int, - num_ins: int, - in_channels: int, - feat_channels: int, - stacked_convs: int = 3, - norm_cfg: OptConfigType = dict( - type='BN', momentum=1e-2, eps=1e-3), - init_cfg: OptMultiConfig = None, - **kwargs) -> None: - self.num_ins = num_ins - self.stacked_convs = stacked_convs - self.norm_cfg = norm_cfg - super().__init__( - num_classes=num_classes, - in_channels=in_channels, - feat_channels=feat_channels, - init_cfg=init_cfg, - **kwargs) - - def _init_layers(self) -> None: - """Initialize layers of the head.""" - self.reg_conv_list = nn.ModuleList() - self.cls_conv_list = nn.ModuleList() - for i in range(self.stacked_convs): - channels = self.in_channels if i == 0 else self.feat_channels - self.reg_conv_list.append( - DepthWiseConvBlock( - channels, self.feat_channels, apply_norm=False)) - self.cls_conv_list.append( - DepthWiseConvBlock( - channels, self.feat_channels, apply_norm=False)) - - self.reg_bn_list = nn.ModuleList([ - nn.ModuleList([ - build_norm_layer( - self.norm_cfg, num_features=self.feat_channels)[1] - for j in range(self.num_ins) - ]) for i in range(self.stacked_convs) - ]) - - self.cls_bn_list = nn.ModuleList([ - nn.ModuleList([ - build_norm_layer( - self.norm_cfg, num_features=self.feat_channels)[1] - for j in range(self.num_ins) - ]) for i in range(self.stacked_convs) - ]) - - self.cls_header = DepthWiseConvBlock( - self.in_channels, - self.num_base_priors * self.cls_out_channels, - apply_norm=False) - self.reg_header = DepthWiseConvBlock( - self.in_channels, self.num_base_priors * 4, apply_norm=False) - self.swish = Swish() - - def init_weights(self) -> None: - """Initialize weights of the head.""" - for m in self.reg_conv_list: - nn.init.constant_(m.pointwise_conv.bias, 0.0) - for m in self.cls_conv_list: - nn.init.constant_(m.pointwise_conv.bias, 0.0) - bias_cls = bias_init_with_prob(0.01) - nn.init.constant_(self.cls_header.pointwise_conv.bias, bias_cls) - nn.init.constant_(self.reg_header.pointwise_conv.bias, 0.0) - - def forward_single_bbox(self, feat: Tensor, level_id: int, - i: int) -> Tensor: - conv_op = self.reg_conv_list[i] - bn = 
self.reg_bn_list[i][level_id] - - feat = conv_op(feat) - feat = bn(feat) - feat = self.swish(feat) - - return feat - - def forward_single_cls(self, feat: Tensor, level_id: int, - i: int) -> Tensor: - conv_op = self.cls_conv_list[i] - bn = self.cls_bn_list[i][level_id] - - feat = conv_op(feat) - feat = bn(feat) - feat = self.swish(feat) - - return feat - - def forward(self, feats: Tuple[Tensor]) -> tuple: - cls_scores = [] - bbox_preds = [] - for level_id in range(self.num_ins): - feat = feats[level_id] - for i in range(self.stacked_convs): - feat = self.forward_single_bbox(feat, level_id, i) - bbox_pred = self.reg_header(feat) - bbox_preds.append(bbox_pred) - for level_id in range(self.num_ins): - feat = feats[level_id] - for i in range(self.stacked_convs): - feat = self.forward_single_cls(feat, level_id, i) - cls_score = self.cls_header(feat) - cls_scores.append(cls_score) - - return cls_scores, bbox_preds - - def loss_by_feat( - self, - cls_scores: List[Tensor], - bbox_preds: List[Tensor], - batch_gt_instances: InstanceList, - batch_img_metas: List[dict], - batch_gt_instances_ignore: OptInstanceList = None) -> dict: - """Calculate the loss based on the features extracted by the detection - head. - - Args: - cls_scores (list[Tensor]): Box scores for each scale level - has shape (N, num_anchors * num_classes, H, W). - bbox_preds (list[Tensor]): Box energies / deltas for each scale - level with shape (N, num_anchors * 4, H, W). - batch_gt_instances (list[:obj:`InstanceData`]): Batch of - gt_instance. It usually includes ``bboxes`` and ``labels`` - attributes. - batch_img_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): - Batch of gt_instances_ignore. It includes ``bboxes`` attribute - data that is ignored during training and testing. - Defaults to None. - - Returns: - dict: A dictionary of loss components. 
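The pattern in `forward_single_bbox` / `forward_single_cls` above is the "separate BN" that gives the head its name: one stack of depthwise convs is shared across all pyramid levels, while each (conv, level) pair gets its own BatchNorm so every level keeps its own statistics. A minimal standalone sketch of that idea follows; the channel count, level count, and activation are assumptions for illustration, not values read from the head.

```python
import torch
import torch.nn as nn


class SepBNBlock(nn.Module):
    """One conv with shared weights, one BatchNorm per pyramid level."""

    def __init__(self, channels: int = 64, num_levels: int = 5) -> None:
        super().__init__()
        self.conv = nn.Conv2d(channels, channels, 3, padding=1)  # shared weights
        self.bns = nn.ModuleList(
            [nn.BatchNorm2d(channels) for _ in range(num_levels)])  # per-level stats
        self.act = nn.SiLU()  # SiLU is the same function as Swish

    def forward(self, feat: torch.Tensor, level_id: int) -> torch.Tensor:
        return self.act(self.bns[level_id](self.conv(feat)))


block = SepBNBlock()
outs = [
    block(torch.rand(2, 64, 64 // 2**i, 64 // 2**i), level_id=i) for i in range(5)
]
```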
- """ - featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] - assert len(featmap_sizes) == self.prior_generator.num_levels - - device = cls_scores[0].device - - anchor_list, valid_flag_list = self.get_anchors( - featmap_sizes, batch_img_metas, device=device) - cls_reg_targets = self.get_targets( - anchor_list, - valid_flag_list, - batch_gt_instances, - batch_img_metas, - batch_gt_instances_ignore=batch_gt_instances_ignore) - (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, - avg_factor) = cls_reg_targets - - # anchor number of multi levels - num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] - # concat all level anchors and flags to a single tensor - concat_anchor_list = [] - for i in range(len(anchor_list)): - concat_anchor_list.append(cat_boxes(anchor_list[i])) - all_anchor_list = images_to_levels(concat_anchor_list, - num_level_anchors) - - avg_factor = reduce_mean( - torch.tensor(avg_factor, dtype=torch.float, device=device)).item() - avg_factor = max(avg_factor, 1.0) - losses_cls, losses_bbox = multi_apply( - self.loss_by_feat_single, - cls_scores, - bbox_preds, - all_anchor_list, - labels_list, - label_weights_list, - bbox_targets_list, - bbox_weights_list, - avg_factor=avg_factor) - return dict(loss_cls=losses_cls, loss_bbox=losses_bbox) - - def loss_by_feat_single(self, cls_score: Tensor, bbox_pred: Tensor, - anchors: Tensor, labels: Tensor, - label_weights: Tensor, bbox_targets: Tensor, - bbox_weights: Tensor, avg_factor: int) -> tuple: - """Calculate the loss of a single scale level based on the features - extracted by the detection head. - - Args: - cls_score (Tensor): Box scores for each scale level - Has shape (N, num_anchors * num_classes, H, W). - bbox_pred (Tensor): Box energies / deltas for each scale - level with shape (N, num_anchors * 4, H, W). - anchors (Tensor): Box reference for each scale level with shape - (N, num_total_anchors, 4). - labels (Tensor): Labels of each anchors with shape - (N, num_total_anchors). - label_weights (Tensor): Label weights of each anchor with shape - (N, num_total_anchors) - bbox_targets (Tensor): BBox regression targets of each anchor - weight shape (N, num_total_anchors, 4). - bbox_weights (Tensor): BBox regression loss weights of each anchor - with shape (N, num_total_anchors, 4). - avg_factor (int): Average factor that is used to average the loss. - - Returns: - tuple: loss components. - """ - - # classification loss - labels = labels.reshape(-1) - label_weights = label_weights.reshape(-1) - cls_score = cls_score.permute(0, 2, 3, - 1).reshape(-1, self.cls_out_channels) - loss_cls = self.loss_cls( - cls_score, labels, label_weights, avg_factor=avg_factor) - # regression loss - target_dim = bbox_targets.size(-1) - bbox_targets = bbox_targets.reshape(-1, target_dim) - bbox_weights = bbox_weights.reshape(-1, target_dim) - bbox_pred = bbox_pred.permute(0, 2, 3, - 1).reshape(-1, - self.bbox_coder.encode_size) - if self.reg_decoded_bbox: - # When the regression loss (e.g. `IouLoss`, `GIouLoss`) - # is applied directly on the decoded bounding boxes, it - # decodes the already encoded coordinates to absolute format. 
- anchors = anchors.reshape(-1, anchors.size(-1)) - bbox_pred = self.bbox_coder.decode(anchors, bbox_pred) - bbox_pred = get_box_tensor(bbox_pred) - loss_bbox = self.loss_bbox( - bbox_pred, bbox_targets, bbox_weights, avg_factor=avg_factor * 4) - return loss_cls, loss_bbox diff --git a/projects/EfficientDet/efficientdet/huber_loss.py b/projects/EfficientDet/efficientdet/huber_loss.py deleted file mode 100644 index 091963fa9..000000000 --- a/projects/EfficientDet/efficientdet/huber_loss.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from typing import Optional - -import torch -import torch.nn as nn -from torch import Tensor - -from mmdet.models.losses.utils import weighted_loss -from mmdet.registry import MODELS - - -@weighted_loss -def huber_loss(pred: Tensor, target: Tensor, beta: float = 1.0) -> Tensor: - """Huber loss. - - Args: - pred (Tensor): The prediction. - target (Tensor): The learning target of the prediction. - beta (float, optional): The threshold in the piecewise function. - Defaults to 1.0. - - Returns: - Tensor: Calculated loss - """ - assert beta > 0 - if target.numel() == 0: - return pred.sum() * 0 - - assert pred.size() == target.size() - diff = torch.abs(pred - target) - loss = torch.where(diff < beta, 0.5 * diff * diff, - beta * diff - 0.5 * beta * beta) - return loss - - -@MODELS.register_module() -class HuberLoss(nn.Module): - """Huber loss. - - Args: - beta (float, optional): The threshold in the piecewise function. - Defaults to 1.0. - reduction (str, optional): The method to reduce the loss. - Options are "none", "mean" and "sum". Defaults to "mean". - loss_weight (float, optional): The weight of loss. - """ - - def __init__(self, - beta: float = 1.0, - reduction: str = 'mean', - loss_weight: float = 1.0) -> None: - super().__init__() - self.beta = beta - self.reduction = reduction - self.loss_weight = loss_weight - - def forward(self, - pred: Tensor, - target: Tensor, - weight: Optional[Tensor] = None, - avg_factor: Optional[int] = None, - reduction_override: Optional[str] = None, - **kwargs) -> Tensor: - """Forward function. - - Args: - pred (Tensor): The prediction. - target (Tensor): The learning target of the prediction. - weight (Tensor, optional): The weight of loss for each - prediction. Defaults to None. - avg_factor (int, optional): Average factor that is used to average - the loss. Defaults to None. - reduction_override (str, optional): The reduction method used to - override the original reduction method of the loss. - Defaults to None. - - Returns: - Tensor: Calculated loss - """ - assert reduction_override in (None, 'none', 'mean', 'sum') - reduction = ( - reduction_override if reduction_override else self.reduction) - loss_bbox = self.loss_weight * huber_loss( - pred, - target, - weight, - beta=self.beta, - reduction=reduction, - avg_factor=avg_factor, - **kwargs) - return loss_bbox diff --git a/projects/EfficientDet/efficientdet/tensorflow/anchor_generator.py b/projects/EfficientDet/efficientdet/tensorflow/anchor_generator.py deleted file mode 100644 index 51936a348..000000000 --- a/projects/EfficientDet/efficientdet/tensorflow/anchor_generator.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
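For reference, the piecewise function implemented by `huber_loss` above, written out with threshold `beta` and residual d = |pred - target| (a transcription of the code, not an addition to it):

```latex
\ell_\beta(d) =
\begin{cases}
  \tfrac{1}{2}\, d^{2}, & d < \beta \\[4pt]
  \beta\, d - \tfrac{1}{2}\, \beta^{2}, & d \ge \beta
\end{cases}
```

Relatedly, the regression branch in `loss_by_feat_single` above passes `avg_factor * 4` because the targets are flattened to one Huber term per coordinate; the extra factor of 4 therefore averages per coordinate rather than per positive anchor.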
-from typing import Optional, Tuple, Union - -import torch -from torch import Tensor - -from mmdet.models.task_modules.prior_generators.anchor_generator import \ - AnchorGenerator -from mmdet.registry import TASK_UTILS -from mmdet.structures.bbox import HorizontalBoxes - -DeviceType = Union[str, torch.device] - - -@TASK_UTILS.register_module() -class YXYXAnchorGenerator(AnchorGenerator): - - def gen_single_level_base_anchors(self, - base_size: Union[int, float], - scales: Tensor, - ratios: Tensor, - center: Optional[Tuple[float]] = None) \ - -> Tensor: - """Generate base anchors of a single level. - - Args: - base_size (int | float): Basic size of an anchor. - scales (torch.Tensor): Scales of the anchor. - ratios (torch.Tensor): The ratio between the height - and width of anchors in a single level. - center (tuple[float], optional): The center of the base anchor - related to a single feature grid. Defaults to None. - - Returns: - torch.Tensor: Anchors in a single-level feature maps. - """ - - w = base_size - h = base_size - if center is None: - x_center = self.center_offset * w - y_center = self.center_offset * h - else: - x_center, y_center = center - - h_ratios = torch.sqrt(ratios) - w_ratios = 1 / h_ratios - if self.scale_major: - ws = (w * scales[:, None] * w_ratios[None, :]).view(-1) - hs = (h * scales[:, None] * h_ratios[None, :]).view(-1) - else: - ws = (w * scales[:, None] * w_ratios[None, :]).view(-1) - hs = (h * scales[:, None] * h_ratios[None, :]).view(-1) - - # use float anchor and the anchor's center is aligned with the - # pixel center - base_anchors = [ - y_center - 0.5 * hs, - x_center - 0.5 * ws, - y_center + 0.5 * hs, - x_center + 0.5 * ws, - ] - base_anchors = torch.stack(base_anchors, dim=-1) - - return base_anchors - - def single_level_grid_priors(self, - featmap_size: Tuple[int, int], - level_idx: int, - dtype: torch.dtype = torch.float32, - device: DeviceType = 'cuda') -> Tensor: - """Generate grid anchors of a single level. - - Note: - This function is usually called by method ``self.grid_priors``. - - Args: - featmap_size (tuple[int, int]): Size of the feature maps. - level_idx (int): The index of corresponding feature map level. - dtype (obj:`torch.dtype`): Date type of points.Defaults to - ``torch.float32``. - device (str | torch.device): The device the tensor will be put on. - Defaults to 'cuda'. - - Returns: - torch.Tensor: Anchors in the overall feature maps. - """ - base_anchors = self.base_anchors[level_idx].to(device).to(dtype) - feat_h, feat_w = featmap_size - stride_w, stride_h = self.strides[level_idx] - # First create Range with the default dtype, than convert to - # target `dtype` for onnx exporting. - shift_x = torch.arange(0, feat_w, device=device).to(dtype) * stride_w - shift_y = torch.arange(0, feat_h, device=device).to(dtype) * stride_h - - shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) - shifts = torch.stack([shift_yy, shift_xx, shift_yy, shift_xx], dim=-1) - # first feat_w elements correspond to the first row of shifts - # add A anchors (1, A, 4) to K shifts (K, 1, 4) to get - # shifted anchors (K, A, 4), reshape to (K*A, 4) - - all_anchors = base_anchors[None, :, :] + shifts[:, None, :] - all_anchors = all_anchors.view(-1, 4) - # first A rows correspond to A anchors of (0, 0) in feature map, - # then (0, 1), (0, 2), ... 
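Worth flagging for anyone reading the generator above: `gen_single_level_base_anchors` and `single_level_grid_priors` stack coordinates in (y1, x1, y2, x2) order, evidently to mirror the box layout of the original TensorFlow implementation (these modules live in the `tensorflow/` subpackage used with converted checkpoints), whereas MMDetection's default `AnchorGenerator` emits (x1, y1, x2, y2). A tiny illustrative conversion between the two layouts, with toy values not taken from the code:

```python
import torch

yxyx = torch.tensor([[-16., -8., 16., 8.]])  # (y1, x1, y2, x2), toy anchor
xyxy = yxyx[:, [1, 0, 3, 2]]                 # reorder to (x1, y1, x2, y2)
print(xyxy)  # x1, y1, x2, y2 = -8, -16, 8, 16
```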
- if self.use_box_type: - all_anchors = HorizontalBoxes(all_anchors) - - return all_anchors diff --git a/projects/EfficientDet/efficientdet/tensorflow/api_wrappers/__init__.py b/projects/EfficientDet/efficientdet/tensorflow/api_wrappers/__init__.py deleted file mode 100644 index a27afc460..000000000 --- a/projects/EfficientDet/efficientdet/tensorflow/api_wrappers/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .coco_api import COCO, COCOeval, COCOPanoptic - -__all__ = ['COCO', 'COCOeval', 'COCOPanoptic'] diff --git a/projects/EfficientDet/efficientdet/tensorflow/api_wrappers/coco_api.py b/projects/EfficientDet/efficientdet/tensorflow/api_wrappers/coco_api.py deleted file mode 100644 index 142f27d7f..000000000 --- a/projects/EfficientDet/efficientdet/tensorflow/api_wrappers/coco_api.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -# This file add snake case alias for coco api - -import warnings -from collections import defaultdict -from typing import List, Optional, Union - -import pycocotools -from pycocotools.coco import COCO as _COCO -from pycocotools.cocoeval import COCOeval as _COCOeval - - -class COCO(_COCO): - """This class is almost the same as official pycocotools package. - - It implements some snake case function aliases. So that the COCO class has - the same interface as LVIS class. - """ - - def __init__(self, annotation_file=None): - if getattr(pycocotools, '__version__', '0') >= '12.0.2': - warnings.warn( - 'mmpycocotools is deprecated. Please install official pycocotools by "pip install pycocotools"', # noqa: E501 - UserWarning) - super().__init__(annotation_file=annotation_file) - self.img_ann_map = self.imgToAnns - self.cat_img_map = self.catToImgs - - def get_ann_ids(self, img_ids=[], cat_ids=[], area_rng=[], iscrowd=None): - return self.getAnnIds(img_ids, cat_ids, area_rng, iscrowd) - - def get_cat_ids(self, cat_names=[], sup_names=[], cat_ids=[]): - cat_ids_coco = self.getCatIds(cat_names, sup_names, cat_ids) - if None in cat_names: - index = [i for i, v in enumerate(cat_names) if v is not None] - cat_ids = list(range(len(cat_names))) - for i in range(len(index)): - cat_ids[index[i]] = cat_ids_coco[i] - return cat_ids - else: - return cat_ids_coco - - def get_img_ids(self, img_ids=[], cat_ids=[]): - return self.getImgIds(img_ids, cat_ids) - - def load_anns(self, ids): - return self.loadAnns(ids) - - def load_cats(self, ids): - return self.loadCats(ids) - - def load_imgs(self, ids): - return self.loadImgs(ids) - - -# just for the ease of import -COCOeval = _COCOeval - - -class COCOPanoptic(COCO): - """This wrapper is for loading the panoptic style annotation file. - - The format is shown in the CocoPanopticDataset class. - - Args: - annotation_file (str, optional): Path of annotation file. - Defaults to None. - """ - - def __init__(self, annotation_file: Optional[str] = None) -> None: - super(COCOPanoptic, self).__init__(annotation_file) - - def createIndex(self) -> None: - """Create index.""" - # create index - print('creating index...') - # anns stores 'segment_id -> annotation' - anns, cats, imgs = {}, {}, {} - img_to_anns, cat_to_imgs = defaultdict(list), defaultdict(list) - if 'annotations' in self.dataset: - for ann in self.dataset['annotations']: - for seg_ann in ann['segments_info']: - # to match with instance.json - seg_ann['image_id'] = ann['image_id'] - img_to_anns[ann['image_id']].append(seg_ann) - # segment_id is not unique in coco dataset orz... 
- # annotations from different images but - # may have same segment_id - if seg_ann['id'] in anns.keys(): - anns[seg_ann['id']].append(seg_ann) - else: - anns[seg_ann['id']] = [seg_ann] - - # filter out annotations from other images - img_to_anns_ = defaultdict(list) - for k, v in img_to_anns.items(): - img_to_anns_[k] = [x for x in v if x['image_id'] == k] - img_to_anns = img_to_anns_ - - if 'images' in self.dataset: - for img_info in self.dataset['images']: - img_info['segm_file'] = img_info['file_name'].replace( - 'jpg', 'png') - imgs[img_info['id']] = img_info - - if 'categories' in self.dataset: - for cat in self.dataset['categories']: - cats[cat['id']] = cat - - if 'annotations' in self.dataset and 'categories' in self.dataset: - for ann in self.dataset['annotations']: - for seg_ann in ann['segments_info']: - cat_to_imgs[seg_ann['category_id']].append(ann['image_id']) - - print('index created!') - - self.anns = anns - self.imgToAnns = img_to_anns - self.catToImgs = cat_to_imgs - self.imgs = imgs - self.cats = cats - - def load_anns(self, - ids: Union[List[int], int] = []) -> Optional[List[dict]]: - """Load anns with the specified ids. - - ``self.anns`` is a list of annotation lists instead of a - list of annotations. - - Args: - ids (Union[List[int], int]): Integer ids specifying anns. - - Returns: - anns (List[dict], optional): Loaded ann objects. - """ - anns = [] - - if hasattr(ids, '__iter__') and hasattr(ids, '__len__'): - # self.anns is a list of annotation lists instead of - # a list of annotations - for id in ids: - anns += self.anns[id] - return anns - elif type(ids) == int: - return self.anns[ids] diff --git a/projects/EfficientDet/efficientdet/tensorflow/coco_90class.py b/projects/EfficientDet/efficientdet/tensorflow/coco_90class.py deleted file mode 100644 index d2996ccb8..000000000 --- a/projects/EfficientDet/efficientdet/tensorflow/coco_90class.py +++ /dev/null @@ -1,198 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -import os.path as osp -from typing import List, Union - -from mmengine.fileio import get_local_path - -from mmdet.datasets.base_det_dataset import BaseDetDataset -from mmdet.registry import DATASETS -from .api_wrappers import COCO - - -@DATASETS.register_module() -class Coco90Dataset(BaseDetDataset): - """Dataset for COCO.""" - - METAINFO = { - 'classes': - ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', - 'truck', 'boat', 'traffic light', 'fire hydrant', None, 'stop sign', - 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', - 'cow', 'elephant', 'bear', 'zebra', 'giraffe', None, 'backpack', - 'umbrella', None, None, 'handbag', 'tie', 'suitcase', 'frisbee', - 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', - 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', - 'bottle', None, 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', - 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', - 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', - 'bed', None, 'dining table', None, None, 'toilet', None, 'tv', - 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', - 'oven', 'toaster', 'sink', 'refrigerator', None, 'book', 'clock', - 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'), - # palette is a list of color tuples, which is used for visualization. 
- 'palette': - [(220, 20, 60), (119, 11, 32), (0, 0, 142), (0, 0, 230), (106, 0, 228), - (0, 60, 100), (0, 80, 100), (0, 0, 70), (0, 0, 192), (250, 170, 30), - (100, 170, 30), None, (220, 220, 0), (175, 116, 175), (250, 0, 30), - (165, 42, 42), (255, 77, 255), (0, 226, 252), (182, 182, 255), - (0, 82, 0), (120, 166, 157), (110, 76, 0), (174, 57, 255), - (199, 100, 0), (72, 0, 118), None, - (255, 179, 240), (0, 125, 92), None, None, (209, 0, 151), - (188, 208, 182), (0, 220, 176), (255, 99, 164), (92, 0, 73), - (133, 129, 255), (78, 180, 255), (0, 228, 0), (174, 255, 243), - (45, 89, 255), (134, 134, 103), (145, 148, 174), (255, 208, 186), - (197, 226, 255), None, (171, 134, 1), (109, 63, 54), (207, 138, 255), - (151, 0, 95), (9, 80, 61), (84, 105, 51), (74, 65, 105), - (166, 196, 102), (208, 195, 210), (255, 109, 65), (0, 143, 149), - (179, 0, 194), (209, 99, 106), (5, 121, 0), (227, 255, 205), - (147, 186, 208), (153, 69, 1), (3, 95, 161), (163, 255, 0), - (119, 0, 170), None, (0, 182, 199), None, None, (0, 165, 120), None, - (183, 130, 88), (95, 32, 0), (130, 114, 135), (110, 129, 133), - (166, 74, 118), (219, 142, 185), (79, 210, 114), (178, 90, 62), - (65, 70, 15), (127, 167, 115), (59, 105, 106), None, (142, 108, 45), - (196, 172, 0), (95, 54, 80), (128, 76, 255), (201, 57, 1), - (246, 0, 122), (191, 162, 208)] - } - COCOAPI = COCO - # ann_id is unique in coco dataset. - ANN_ID_UNIQUE = True - - def load_data_list(self) -> List[dict]: - """Load annotations from an annotation file named as ``self.ann_file`` - - Returns: - List[dict]: A list of annotation. - """ # noqa: E501 - with get_local_path( - self.ann_file, backend_args=self.backend_args) as local_path: - self.coco = self.COCOAPI(local_path) - # The order of returned `cat_ids` will not - # change with the order of the `classes` - self.cat_ids = self.coco.get_cat_ids( - cat_names=self.metainfo['classes']) - self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} - self.cat_img_map = copy.deepcopy(self.coco.cat_img_map) - - img_ids = self.coco.get_img_ids() - data_list = [] - total_ann_ids = [] - for img_id in img_ids: - raw_img_info = self.coco.load_imgs([img_id])[0] - raw_img_info['img_id'] = img_id - - ann_ids = self.coco.get_ann_ids(img_ids=[img_id]) - raw_ann_info = self.coco.load_anns(ann_ids) - total_ann_ids.extend(ann_ids) - - parsed_data_info = self.parse_data_info({ - 'raw_ann_info': - raw_ann_info, - 'raw_img_info': - raw_img_info - }) - data_list.append(parsed_data_info) - if self.ANN_ID_UNIQUE: - assert len(set(total_ann_ids)) == len( - total_ann_ids - ), f"Annotation ids in '{self.ann_file}' are not unique!" - - del self.coco - - return data_list - - def parse_data_info(self, raw_data_info: dict) -> Union[dict, List[dict]]: - """Parse raw annotation to target format. - - Args: - raw_data_info (dict): Raw data information load from ``ann_file`` - - Returns: - Union[dict, List[dict]]: Parsed annotation. 
- """ - img_info = raw_data_info['raw_img_info'] - ann_info = raw_data_info['raw_ann_info'] - - data_info = {} - - # TODO: need to change data_prefix['img'] to data_prefix['img_path'] - img_path = osp.join(self.data_prefix['img'], img_info['file_name']) - if self.data_prefix.get('seg', None): - seg_map_path = osp.join( - self.data_prefix['seg'], - img_info['file_name'].rsplit('.', 1)[0] + self.seg_map_suffix) - else: - seg_map_path = None - data_info['img_path'] = img_path - data_info['img_id'] = img_info['img_id'] - data_info['seg_map_path'] = seg_map_path - data_info['height'] = img_info['height'] - data_info['width'] = img_info['width'] - - instances = [] - for i, ann in enumerate(ann_info): - instance = {} - - if ann.get('ignore', False): - continue - x1, y1, w, h = ann['bbox'] - inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0)) - inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0)) - if inter_w * inter_h == 0: - continue - if ann['area'] <= 0 or w < 1 or h < 1: - continue - if ann['category_id'] not in self.cat_ids: - continue - bbox = [x1, y1, x1 + w, y1 + h] - - if ann.get('iscrowd', False): - instance['ignore_flag'] = 1 - else: - instance['ignore_flag'] = 0 - instance['bbox'] = bbox - instance['bbox_label'] = self.cat2label[ann['category_id']] - - if ann.get('segmentation', None): - instance['mask'] = ann['segmentation'] - - instances.append(instance) - data_info['instances'] = instances - return data_info - - def filter_data(self) -> List[dict]: - """Filter annotations according to filter_cfg. - - Returns: - List[dict]: Filtered results. - """ - if self.test_mode: - return self.data_list - - if self.filter_cfg is None: - return self.data_list - - filter_empty_gt = self.filter_cfg.get('filter_empty_gt', False) - min_size = self.filter_cfg.get('min_size', 0) - - # obtain images that contain annotation - ids_with_ann = set(data_info['img_id'] for data_info in self.data_list) - # obtain images that contain annotations of the required categories - ids_in_cat = set() - for i, class_id in enumerate(self.cat_ids): - ids_in_cat |= set(self.cat_img_map[class_id]) - # merge the image id sets of the two conditions and use the merged set - # to filter out images if self.filter_empty_gt=True - ids_in_cat &= ids_with_ann - - valid_data_infos = [] - for i, data_info in enumerate(self.data_list): - img_id = data_info['img_id'] - width = data_info['width'] - height = data_info['height'] - if filter_empty_gt and img_id not in ids_in_cat: - continue - if min(width, height) >= min_size: - valid_data_infos.append(data_info) - - return valid_data_infos diff --git a/projects/EfficientDet/efficientdet/tensorflow/coco_90metric.py b/projects/EfficientDet/efficientdet/tensorflow/coco_90metric.py deleted file mode 100644 index eed652240..000000000 --- a/projects/EfficientDet/efficientdet/tensorflow/coco_90metric.py +++ /dev/null @@ -1,540 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import datetime -import itertools -import os.path as osp -import tempfile -from collections import OrderedDict -from typing import Dict, List, Optional, Sequence, Union - -import numpy as np -from mmengine.evaluator import BaseMetric -from mmengine.fileio import dump, get_local_path, load -from mmengine.logging import MMLogger -from terminaltables import AsciiTable - -from mmdet.evaluation.functional import eval_recalls -from mmdet.registry import METRICS -from mmdet.structures.mask import encode_mask_results -from .api_wrappers import COCO, COCOeval - - -@METRICS.register_module() -class Coco90Metric(BaseMetric): - """COCO evaluation metric. - - Evaluate AR, AP, and mAP for detection tasks including proposal/box - detection and instance segmentation. Please refer to - https://cocodataset.org/#detection-eval for more details. - - Args: - ann_file (str, optional): Path to the coco format annotation file. - If not specified, ground truth annotations from the dataset will - be converted to coco format. Defaults to None. - metric (str | List[str]): Metrics to be evaluated. Valid metrics - include 'bbox', 'segm', 'proposal', and 'proposal_fast'. - Defaults to 'bbox'. - classwise (bool): Whether to evaluate the metric class-wise. - Defaults to False. - proposal_nums (Sequence[int]): Numbers of proposals to be evaluated. - Defaults to (100, 300, 1000). - iou_thrs (float | List[float], optional): IoU threshold to compute AP - and AR. If not specified, IoUs from 0.5 to 0.95 will be used. - Defaults to None. - metric_items (List[str], optional): Metric result names to be - recorded in the evaluation result. Defaults to None. - format_only (bool): Format the output results without perform - evaluation. It is useful when you want to format the result - to a specific format and submit it to the test server. - Defaults to False. - outfile_prefix (str, optional): The prefix of json files. It includes - the file path and the prefix of filename, e.g., "a/b/prefix". - If not specified, a temp file will be created. Defaults to None. - backend_args (dict, optional): Arguments to instantiate the - corresponding backend. Defaults to None. - collect_device (str): Device name used for collecting results from - different ranks during distributed training. Must be 'cpu' or - 'gpu'. Defaults to 'cpu'. - prefix (str, optional): The prefix that will be added in the metric - names to disambiguate homonymous metrics of different evaluators. - If prefix is not provided in the argument, self.default_prefix - will be used instead. Defaults to None. 
- """ - default_prefix: Optional[str] = 'coco' - - def __init__(self, - ann_file: Optional[str] = None, - metric: Union[str, List[str]] = 'bbox', - classwise: bool = False, - proposal_nums: Sequence[int] = (100, 300, 1000), - iou_thrs: Optional[Union[float, Sequence[float]]] = None, - metric_items: Optional[Sequence[str]] = None, - format_only: bool = False, - outfile_prefix: Optional[str] = None, - backend_args: dict = None, - collect_device: str = 'cpu', - prefix: Optional[str] = None) -> None: - super().__init__(collect_device=collect_device, prefix=prefix) - # coco evaluation metrics - self.metrics = metric if isinstance(metric, list) else [metric] - allowed_metrics = ['bbox', 'segm', 'proposal', 'proposal_fast'] - for metric in self.metrics: - if metric not in allowed_metrics: - raise KeyError( - "metric should be one of 'bbox', 'segm', 'proposal', " - f"'proposal_fast', but got {metric}.") - - # do class wise evaluation, default False - self.classwise = classwise - - # proposal_nums used to compute recall or precision. - self.proposal_nums = list(proposal_nums) - - # iou_thrs used to compute recall or precision. - if iou_thrs is None: - iou_thrs = np.linspace( - .5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True) - self.iou_thrs = iou_thrs - self.metric_items = metric_items - self.format_only = format_only - if self.format_only: - assert outfile_prefix is not None, 'outfile_prefix must be not' - 'None when format_only is True, otherwise the result files will' - 'be saved to a temp directory which will be cleaned up at the end.' - - self.outfile_prefix = outfile_prefix - - self.backend_args = backend_args - - # if ann_file is not specified, - # initialize coco api with the converted dataset - if ann_file is not None: - with get_local_path( - ann_file, backend_args=self.backend_args) as local_path: - self._coco_api = COCO(local_path) - else: - self._coco_api = None - - # handle dataset lazy init - self.cat_ids = None - self.img_ids = None - - def fast_eval_recall(self, - results: List[dict], - proposal_nums: Sequence[int], - iou_thrs: Sequence[float], - logger: Optional[MMLogger] = None) -> np.ndarray: - """Evaluate proposal recall with COCO's fast_eval_recall. - - Args: - results (List[dict]): Results of the dataset. - proposal_nums (Sequence[int]): Proposal numbers used for - evaluation. - iou_thrs (Sequence[float]): IoU thresholds used for evaluation. - logger (MMLogger, optional): Logger used for logging the recall - summary. - Returns: - np.ndarray: Averaged recall results. - """ - gt_bboxes = [] - pred_bboxes = [result['bboxes'] for result in results] - for i in range(len(self.img_ids)): - ann_ids = self._coco_api.get_ann_ids(img_ids=self.img_ids[i]) - ann_info = self._coco_api.load_anns(ann_ids) - if len(ann_info) == 0: - gt_bboxes.append(np.zeros((0, 4))) - continue - bboxes = [] - for ann in ann_info: - if ann.get('ignore', False) or ann['iscrowd']: - continue - x1, y1, w, h = ann['bbox'] - bboxes.append([x1, y1, x1 + w, y1 + h]) - bboxes = np.array(bboxes, dtype=np.float32) - if bboxes.shape[0] == 0: - bboxes = np.zeros((0, 4)) - gt_bboxes.append(bboxes) - - recalls = eval_recalls( - gt_bboxes, pred_bboxes, proposal_nums, iou_thrs, logger=logger) - ar = recalls.mean(axis=1) - return ar - - def xyxy2xywh(self, bbox: np.ndarray) -> list: - """Convert ``xyxy`` style bounding boxes to ``xywh`` style for COCO - evaluation. - - Args: - bbox (numpy.ndarray): The bounding boxes, shape (4, ), in - ``xyxy`` order. 
- - Returns: - list[float]: The converted bounding boxes, in ``xywh`` order. - """ - - _bbox: List = bbox.tolist() - return [ - _bbox[0], - _bbox[1], - _bbox[2] - _bbox[0], - _bbox[3] - _bbox[1], - ] - - def results2json(self, results: Sequence[dict], - outfile_prefix: str) -> dict: - """Dump the detection results to a COCO style json file. - - There are 3 types of results: proposals, bbox predictions, mask - predictions, and they have different data types. This method will - automatically recognize the type, and dump them to json files. - - Args: - results (Sequence[dict]): Testing results of the - dataset. - outfile_prefix (str): The filename prefix of the json files. If the - prefix is "somepath/xxx", the json files will be named - "somepath/xxx.bbox.json", "somepath/xxx.segm.json", - "somepath/xxx.proposal.json". - - Returns: - dict: Possible keys are "bbox", "segm", "proposal", and - values are corresponding filenames. - """ - bbox_json_results = [] - segm_json_results = [] if 'masks' in results[0] else None - for idx, result in enumerate(results): - image_id = result.get('img_id', idx) - labels = result['labels'] - bboxes = result['bboxes'] - scores = result['scores'] - # bbox results - for i, label in enumerate(labels): - data = dict() - data['image_id'] = image_id - data['bbox'] = self.xyxy2xywh(bboxes[i]) - data['score'] = float(scores[i]) - data['category_id'] = self.cat_ids[label] - bbox_json_results.append(data) - - if segm_json_results is None: - continue - - # segm results - masks = result['masks'] - mask_scores = result.get('mask_scores', scores) - for i, label in enumerate(labels): - data = dict() - data['image_id'] = image_id - data['bbox'] = self.xyxy2xywh(bboxes[i]) - data['score'] = float(mask_scores[i]) - data['category_id'] = self.cat_ids[label] - if isinstance(masks[i]['counts'], bytes): - masks[i]['counts'] = masks[i]['counts'].decode() - data['segmentation'] = masks[i] - segm_json_results.append(data) - - result_files = dict() - result_files['bbox'] = f'{outfile_prefix}.bbox.json' - result_files['proposal'] = f'{outfile_prefix}.bbox.json' - dump(bbox_json_results, result_files['bbox']) - - if segm_json_results is not None: - result_files['segm'] = f'{outfile_prefix}.segm.json' - dump(segm_json_results, result_files['segm']) - - return result_files - - def gt_to_coco_json(self, gt_dicts: Sequence[dict], - outfile_prefix: str) -> str: - """Convert ground truth to coco format json file. - - Args: - gt_dicts (Sequence[dict]): Ground truth of the dataset. - outfile_prefix (str): The filename prefix of the json files. If the - prefix is "somepath/xxx", the json file will be named - "somepath/xxx.gt.json". - Returns: - str: The filename of the json file. 
- """ - categories = [ - dict(id=id, name=name) - for id, name in enumerate(self.dataset_meta['classes']) - ] - image_infos = [] - annotations = [] - - for idx, gt_dict in enumerate(gt_dicts): - img_id = gt_dict.get('img_id', idx) - image_info = dict( - id=img_id, - width=gt_dict['width'], - height=gt_dict['height'], - file_name='') - image_infos.append(image_info) - for ann in gt_dict['anns']: - label = ann['bbox_label'] - bbox = ann['bbox'] - coco_bbox = [ - bbox[0], - bbox[1], - bbox[2] - bbox[0], - bbox[3] - bbox[1], - ] - - annotation = dict( - id=len(annotations) + - 1, # coco api requires id starts with 1 - image_id=img_id, - bbox=coco_bbox, - iscrowd=ann.get('ignore_flag', 0), - category_id=int(label), - area=coco_bbox[2] * coco_bbox[3]) - if ann.get('mask', None): - mask = ann['mask'] - # area = mask_util.area(mask) - if isinstance(mask, dict) and isinstance( - mask['counts'], bytes): - mask['counts'] = mask['counts'].decode() - annotation['segmentation'] = mask - # annotation['area'] = float(area) - annotations.append(annotation) - - info = dict( - date_created=str(datetime.datetime.now()), - description='Coco json file converted by mmdet CocoMetric.') - coco_json = dict( - info=info, - images=image_infos, - categories=categories, - licenses=None, - ) - if len(annotations) > 0: - coco_json['annotations'] = annotations - converted_json_path = f'{outfile_prefix}.gt.json' - dump(coco_json, converted_json_path) - return converted_json_path - - # TODO: data_batch is no longer needed, consider adjusting the - # parameter position - def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: - """Process one batch of data samples and predictions. The processed - results should be stored in ``self.results``, which will be used to - compute the metrics when all batches have been processed. - - Args: - data_batch (dict): A batch of data from the dataloader. - data_samples (Sequence[dict]): A batch of data samples that - contain annotations and predictions. - """ - for data_sample in data_samples: - result = dict() - pred = data_sample['pred_instances'] - result['img_id'] = data_sample['img_id'] - result['bboxes'] = pred['bboxes'].cpu().numpy() - result['scores'] = pred['scores'].cpu().numpy() - result['labels'] = pred['labels'].cpu().numpy() - # encode mask to RLE - if 'masks' in pred: - result['masks'] = encode_mask_results( - pred['masks'].detach().cpu().numpy()) - # some detectors use different scores for bbox and mask - if 'mask_scores' in pred: - result['mask_scores'] = pred['mask_scores'].cpu().numpy() - - # parse gt - gt = dict() - gt['width'] = data_sample['ori_shape'][1] - gt['height'] = data_sample['ori_shape'][0] - gt['img_id'] = data_sample['img_id'] - if self._coco_api is None: - # TODO: Need to refactor to support LoadAnnotations - assert 'instances' in data_sample, \ - 'ground truth is required for evaluation when ' \ - '`ann_file` is not provided' - gt['anns'] = data_sample['instances'] - # add converted result to the results list - self.results.append((gt, result)) - - def compute_metrics(self, results: list) -> Dict[str, float]: - """Compute the metrics from processed results. - - Args: - results (list): The processed results of each batch. - - Returns: - Dict[str, float]: The computed metrics. The keys are the names of - the metrics, and the values are corresponding results. 
- """ - logger: MMLogger = MMLogger.get_current_instance() - - # split gt and prediction list - gts, preds = zip(*results) - - tmp_dir = None - if self.outfile_prefix is None: - tmp_dir = tempfile.TemporaryDirectory() - outfile_prefix = osp.join(tmp_dir.name, 'results') - else: - outfile_prefix = self.outfile_prefix - - if self._coco_api is None: - # use converted gt json file to initialize coco api - logger.info('Converting ground truth to coco format...') - coco_json_path = self.gt_to_coco_json( - gt_dicts=gts, outfile_prefix=outfile_prefix) - self._coco_api = COCO(coco_json_path) - - # handle lazy init - if self.cat_ids is None: - self.cat_ids = self._coco_api.get_cat_ids( - cat_names=self.dataset_meta['classes']) - if self.img_ids is None: - self.img_ids = self._coco_api.get_img_ids() - - # convert predictions to coco format and dump to json file - result_files = self.results2json(preds, outfile_prefix) - - eval_results = OrderedDict() - if self.format_only: - logger.info('results are saved in ' - f'{osp.dirname(outfile_prefix)}') - return eval_results - - for metric in self.metrics: - logger.info(f'Evaluating {metric}...') - - # TODO: May refactor fast_eval_recall to an independent metric? - # fast eval recall - if metric == 'proposal_fast': - ar = self.fast_eval_recall( - preds, self.proposal_nums, self.iou_thrs, logger=logger) - log_msg = [] - for i, num in enumerate(self.proposal_nums): - eval_results[f'AR@{num}'] = ar[i] - log_msg.append(f'\nAR@{num}\t{ar[i]:.4f}') - log_msg = ''.join(log_msg) - logger.info(log_msg) - continue - - # evaluate proposal, bbox and segm - iou_type = 'bbox' if metric == 'proposal' else metric - if metric not in result_files: - raise KeyError(f'{metric} is not in results') - try: - predictions = load(result_files[metric]) - if iou_type == 'segm': - # Refer to https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/coco.py#L331 # noqa - # When evaluating mask AP, if the results contain bbox, - # cocoapi will use the box area instead of the mask area - # for calculating the instance area. Though the overall AP - # is not affected, this leads to different - # small/medium/large mask AP results. 
- for x in predictions: - x.pop('bbox') - coco_dt = self._coco_api.loadRes(predictions) - - except IndexError: - logger.error( - 'The testing results of the whole dataset is empty.') - break - - coco_eval = COCOeval(self._coco_api, coco_dt, iou_type) - - coco_eval.params.catIds = self.cat_ids - coco_eval.params.imgIds = self.img_ids - coco_eval.params.maxDets = list(self.proposal_nums) - coco_eval.params.iouThrs = self.iou_thrs - - # mapping of cocoEval.stats - coco_metric_names = { - 'mAP': 0, - 'mAP_50': 1, - 'mAP_75': 2, - 'mAP_s': 3, - 'mAP_m': 4, - 'mAP_l': 5, - 'AR@100': 6, - 'AR@300': 7, - 'AR@1000': 8, - 'AR_s@1000': 9, - 'AR_m@1000': 10, - 'AR_l@1000': 11 - } - metric_items = self.metric_items - if metric_items is not None: - for metric_item in metric_items: - if metric_item not in coco_metric_names: - raise KeyError( - f'metric item "{metric_item}" is not supported') - - if metric == 'proposal': - coco_eval.params.useCats = 0 - coco_eval.evaluate() - coco_eval.accumulate() - coco_eval.summarize() - if metric_items is None: - metric_items = [ - 'AR@100', 'AR@300', 'AR@1000', 'AR_s@1000', - 'AR_m@1000', 'AR_l@1000' - ] - - for item in metric_items: - val = float( - f'{coco_eval.stats[coco_metric_names[item]]:.3f}') - eval_results[item] = val - else: - coco_eval.evaluate() - coco_eval.accumulate() - coco_eval.summarize() - if self.classwise: # Compute per-category AP - # Compute per-category AP - # from https://github.com/facebookresearch/detectron2/ - precisions = coco_eval.eval['precision'] - # precision: (iou, recall, cls, area range, max dets) - assert len(self.cat_ids) == precisions.shape[2] - - results_per_category = [] - for idx, cat_id in enumerate(self.cat_ids): - # area range index 0: all area ranges - # max dets index -1: typically 100 per image - nm = self._coco_api.loadCats(cat_id)[0] - precision = precisions[:, :, idx, 0, -1] - precision = precision[precision > -1] - if precision.size: - ap = np.mean(precision) - else: - ap = float('nan') - results_per_category.append( - (f'{nm["name"]}', f'{round(ap, 3)}')) - eval_results[f'{nm["name"]}_precision'] = round(ap, 3) - - num_columns = min(6, len(results_per_category) * 2) - results_flatten = list( - itertools.chain(*results_per_category)) - headers = ['category', 'AP'] * (num_columns // 2) - results_2d = itertools.zip_longest(*[ - results_flatten[i::num_columns] - for i in range(num_columns) - ]) - table_data = [headers] - table_data += [result for result in results_2d] - table = AsciiTable(table_data) - logger.info('\n' + table.table) - - if metric_items is None: - metric_items = [ - 'mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l' - ] - - for metric_item in metric_items: - key = f'{metric}_{metric_item}' - val = coco_eval.stats[coco_metric_names[metric_item]] - eval_results[key] = float(f'{round(val, 3)}') - - ap = coco_eval.stats[:6] - logger.info(f'{metric}_mAP_copypaste: {ap[0]:.3f} ' - f'{ap[1]:.3f} {ap[2]:.3f} {ap[3]:.3f} ' - f'{ap[4]:.3f} {ap[5]:.3f}') - - if tmp_dir is not None: - tmp_dir.cleanup() - return eval_results diff --git a/projects/EfficientDet/efficientdet/tensorflow/trans_max_iou_assigner.py b/projects/EfficientDet/efficientdet/tensorflow/trans_max_iou_assigner.py deleted file mode 100644 index 10fc45b5b..000000000 --- a/projects/EfficientDet/efficientdet/tensorflow/trans_max_iou_assigner.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
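Before the deleted assigner below: if one wanted to use the `Coco90Metric` defined above, it would typically be wired into a config as the evaluator, along the lines of the sketch below. The paths and the `custom_imports` target are placeholders, not taken from this PR; the keyword arguments mirror the constructor shown above.

```python
# Hypothetical config snippet (placeholder paths): register the project code
# and use Coco90Metric as the validation/test evaluator.
custom_imports = dict(
    imports=['projects.EfficientDet.efficientdet'],  # assumed import path
    allow_failed_imports=False)

val_evaluator = dict(
    type='Coco90Metric',
    ann_file='data/coco/annotations/instances_val2017.json',  # placeholder
    metric='bbox',      # also accepts 'segm', 'proposal', 'proposal_fast'
    classwise=False,
    format_only=False)
test_evaluator = val_evaluator
```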
-from typing import Optional - -import torch -from mmengine.structures import InstanceData - -from mmdet.models.task_modules.assigners.assign_result import AssignResult -from mmdet.models.task_modules.assigners.max_iou_assigner import MaxIoUAssigner -from mmdet.registry import TASK_UTILS - - -@TASK_UTILS.register_module() -class TransMaxIoUAssigner(MaxIoUAssigner): - - def assign(self, - pred_instances: InstanceData, - gt_instances: InstanceData, - gt_instances_ignore: Optional[InstanceData] = None, - **kwargs) -> AssignResult: - """Assign gt to bboxes. - - This method assign a gt bbox to every bbox (proposal/anchor), each bbox - will be assigned with -1, or a semi-positive number. -1 means negative - sample, semi-positive number is the index (0-based) of assigned gt. - The assignment is done in following steps, the order matters. - - 1. assign every bbox to the background - 2. assign proposals whose iou with all gts < neg_iou_thr to 0 - 3. for each bbox, if the iou with its nearest gt >= pos_iou_thr, - assign it to that bbox - 4. for each gt bbox, assign its nearest proposals (may be more than - one) to itself - - Args: - pred_instances (:obj:`InstanceData`): Instances of model - predictions. It includes ``priors``, and the priors can - be anchors or points, or the bboxes predicted by the - previous stage, has shape (n, 4). The bboxes predicted by - the current model or stage will be named ``bboxes``, - ``labels``, and ``scores``, the same as the ``InstanceData`` - in other places. - gt_instances (:obj:`InstanceData`): Ground truth of instance - annotations. It usually includes ``bboxes``, with shape (k, 4), - and ``labels``, with shape (k, ). - gt_instances_ignore (:obj:`InstanceData`, optional): Instances - to be ignored during training. It includes ``bboxes`` - attribute data that is ignored during training and testing. - Defaults to None. - - Returns: - :obj:`AssignResult`: The assign result. - - Example: - >>> from mmengine.structures import InstanceData - >>> self = MaxIoUAssigner(0.5, 0.5) - >>> pred_instances = InstanceData() - >>> pred_instances.priors = torch.Tensor([[0, 0, 10, 10], - ... 
[10, 10, 20, 20]]) - >>> gt_instances = InstanceData() - >>> gt_instances.bboxes = torch.Tensor([[0, 0, 10, 9]]) - >>> gt_instances.labels = torch.Tensor([0]) - >>> assign_result = self.assign(pred_instances, gt_instances) - >>> expected_gt_inds = torch.LongTensor([1, 0]) - >>> assert torch.all(assign_result.gt_inds == expected_gt_inds) - """ - gt_bboxes = gt_instances.bboxes - priors = pred_instances.priors - gt_labels = gt_instances.labels - if gt_instances_ignore is not None: - gt_bboxes_ignore = gt_instances_ignore.bboxes - else: - gt_bboxes_ignore = None - - assign_on_cpu = True if (self.gpu_assign_thr > 0) and ( - gt_bboxes.shape[0] > self.gpu_assign_thr) else False - # compute overlap and assign gt on CPU when number of GT is large - if assign_on_cpu: - device = priors.device - priors = priors.cpu() - gt_bboxes = gt_bboxes.cpu() - gt_labels = gt_labels.cpu() - if gt_bboxes_ignore is not None: - gt_bboxes_ignore = gt_bboxes_ignore.cpu() - - trans_priors = torch.cat([ - priors[..., 1].view(-1, 1), priors[..., 0].view(-1, 1), - priors[..., 3].view(-1, 1), priors[..., 2].view(-1, 1) - ], - dim=-1) - overlaps = self.iou_calculator(gt_bboxes, trans_priors) - - if (self.ignore_iof_thr > 0 and gt_bboxes_ignore is not None - and gt_bboxes_ignore.numel() > 0 and trans_priors.numel() > 0): - if self.ignore_wrt_candidates: - ignore_overlaps = self.iou_calculator( - trans_priors, gt_bboxes_ignore, mode='iof') - ignore_max_overlaps, _ = ignore_overlaps.max(dim=1) - else: - ignore_overlaps = self.iou_calculator( - gt_bboxes_ignore, trans_priors, mode='iof') - ignore_max_overlaps, _ = ignore_overlaps.max(dim=0) - overlaps[:, ignore_max_overlaps > self.ignore_iof_thr] = -1 - - assign_result = self.assign_wrt_overlaps(overlaps, gt_labels) - if assign_on_cpu: - assign_result.gt_inds = assign_result.gt_inds.to(device) - assign_result.max_overlaps = assign_result.max_overlaps.to(device) - if assign_result.labels is not None: - assign_result.labels = assign_result.labels.to(device) - return assign_result diff --git a/projects/EfficientDet/efficientdet/tensorflow/yxyx_bbox_coder.py b/projects/EfficientDet/efficientdet/tensorflow/yxyx_bbox_coder.py deleted file mode 100644 index 63e233002..000000000 --- a/projects/EfficientDet/efficientdet/tensorflow/yxyx_bbox_coder.py +++ /dev/null @@ -1,369 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import warnings - -import numpy as np -import torch - -from mmdet.models.task_modules.coders.delta_xywh_bbox_coder import \ - DeltaXYWHBBoxCoder -from mmdet.registry import TASK_UTILS -from mmdet.structures.bbox import HorizontalBoxes, get_box_tensor - - -@TASK_UTILS.register_module() -class YXYXDeltaXYWHBBoxCoder(DeltaXYWHBBoxCoder): - - def encode(self, bboxes, gt_bboxes): - """Get box regression transformation deltas that can be used to - transform the ``bboxes`` into the ``gt_bboxes``. - - Args: - bboxes (torch.Tensor or :obj:`BaseBoxes`): Source boxes, - e.g., object proposals. - gt_bboxes (torch.Tensor or :obj:`BaseBoxes`): Target of the - transformation, e.g., ground-truth boxes. - - Returns: - torch.Tensor: Box transformation deltas - """ - bboxes = get_box_tensor(bboxes) - gt_bboxes = get_box_tensor(gt_bboxes) - assert bboxes.size(0) == gt_bboxes.size(0) - assert bboxes.size(-1) == gt_bboxes.size(-1) == 4 - encoded_bboxes = YXbbox2delta(bboxes, gt_bboxes, self.means, self.stds) - return encoded_bboxes - - def decode(self, - bboxes, - pred_bboxes, - max_shape=None, - wh_ratio_clip=16 / 1000): - """Apply transformation `pred_bboxes` to `boxes`. 
- - Args: - bboxes (torch.Tensor or :obj:`BaseBoxes`): Basic boxes. Shape - (B, N, 4) or (N, 4) - pred_bboxes (Tensor): Encoded offsets with respect to each roi. - Has shape (B, N, num_classes * 4) or (B, N, 4) or - (N, num_classes * 4) or (N, 4). Note N = num_anchors * W * H - when rois is a grid of anchors.Offset encoding follows [1]_. - max_shape (Sequence[int] or torch.Tensor or Sequence[ - Sequence[int]],optional): Maximum bounds for boxes, specifies - (H, W, C) or (H, W). If bboxes shape is (B, N, 4), then - the max_shape should be a Sequence[Sequence[int]] - and the length of max_shape should also be B. - wh_ratio_clip (float, optional): The allowed ratio between - width and height. - - Returns: - Union[torch.Tensor, :obj:`BaseBoxes`]: Decoded boxes. - """ - bboxes = get_box_tensor(bboxes) - assert pred_bboxes.size(0) == bboxes.size(0) - if pred_bboxes.ndim == 3: - assert pred_bboxes.size(1) == bboxes.size(1) - - if pred_bboxes.ndim == 2 and not torch.onnx.is_in_onnx_export(): - # single image decode - decoded_bboxes = YXdelta2bbox(bboxes, pred_bboxes, self.means, - self.stds, max_shape, wh_ratio_clip, - self.clip_border, self.add_ctr_clamp, - self.ctr_clamp) - else: - if pred_bboxes.ndim == 3 and not torch.onnx.is_in_onnx_export(): - warnings.warn( - 'DeprecationWarning: onnx_delta2bbox is deprecated ' - 'in the case of batch decoding and non-ONNX, ' - 'please use “delta2bbox” instead. In order to improve ' - 'the decoding speed, the batch function will no ' - 'longer be supported. ') - decoded_bboxes = YXonnx_delta2bbox(bboxes, pred_bboxes, self.means, - self.stds, max_shape, - wh_ratio_clip, self.clip_border, - self.add_ctr_clamp, - self.ctr_clamp) - - if self.use_box_type: - assert decoded_bboxes.size(-1) == 4, \ - ('Cannot warp decoded boxes with box type when decoded boxes' - 'have shape of (N, num_classes * 4)') - decoded_bboxes = HorizontalBoxes(decoded_bboxes) - return decoded_bboxes - - -def YXdelta2bbox(rois, - deltas, - means=(0., 0., 0., 0.), - stds=(1., 1., 1., 1.), - max_shape=None, - hw_ratio_clip=1000 / 16, - clip_border=True, - add_ctr_clamp=False, - ctr_clamp=32): - """Apply deltas to shift/scale base boxes. - - Typically the rois are anchor or proposed bounding boxes and the deltas are - network outputs used to shift/scale those boxes. - This is the inverse function of :func:`bbox2delta`. - - Args: - rois (Tensor): Boxes to be transformed. Has shape (N, 4). - deltas (Tensor): Encoded offsets relative to each roi. - Has shape (N, num_classes * 4) or (N, 4). Note - N = num_base_anchors * W * H, when rois is a grid of - anchors. Offset encoding follows [1]_. - means (Sequence[float]): Denormalizing means for delta coordinates. - Default (0., 0., 0., 0.). - stds (Sequence[float]): Denormalizing standard deviation for delta - coordinates. Default (1., 1., 1., 1.). - max_shape (tuple[int, int]): Maximum bounds for boxes, specifies - (H, W). Default None. - wh_ratio_clip (float): Maximum aspect ratio for boxes. Default - 16 / 1000. - clip_border (bool, optional): Whether clip the objects outside the - border of the image. Default True. - add_ctr_clamp (bool): Whether to add center clamp. When set to True, - the center of the prediction bounding box will be clamped to - avoid being too far away from the center of the anchor. - Only used by YOLOF. Default False. - ctr_clamp (int): the maximum pixel shift to clamp. Only used by YOLOF. - Default 32. - - Returns: - Tensor: Boxes with shape (N, num_classes * 4) or (N, 4), where 4 - represent tl_x, tl_y, br_x, br_y. 
- - References: - .. [1] https://arxiv.org/abs/1311.2524 - - Example: - >>> rois = torch.Tensor([[ 0., 0., 1., 1.], - >>> [ 0., 0., 1., 1.], - >>> [ 0., 0., 1., 1.], - >>> [ 5., 5., 5., 5.]]) - >>> deltas = torch.Tensor([[ 0., 0., 0., 0.], - >>> [ 1., 1., 1., 1.], - >>> [ 0., 0., 2., -1.], - >>> [ 0.7, -1.9, -0.5, 0.3]]) - >>> delta2bbox(rois, deltas, max_shape=(32, 32, 3)) - tensor([[0.0000, 0.0000, 1.0000, 1.0000], - [0.1409, 0.1409, 2.8591, 2.8591], - [0.0000, 0.3161, 4.1945, 0.6839], - [5.0000, 5.0000, 5.0000, 5.0000]]) - """ - num_bboxes, num_classes = deltas.size(0), deltas.size(1) // 4 - if num_bboxes == 0: - return deltas - - deltas = deltas.reshape(-1, 4) - - means = deltas.new_tensor(means).view(1, -1) - stds = deltas.new_tensor(stds).view(1, -1) - denorm_deltas = deltas * stds + means - - dyx = denorm_deltas[:, :2] - dhw = denorm_deltas[:, 2:] - - # Compute width/height of each roi - rois_ = rois.repeat(1, num_classes).reshape(-1, 4) - pyx = ((rois_[:, :2] + rois_[:, 2:]) * 0.5) - phw = (rois_[:, 2:] - rois_[:, :2]) - - dyx_hw = phw * dyx - - max_ratio = np.abs(np.log(hw_ratio_clip)) - if add_ctr_clamp: - dyx_hw = torch.clamp(dyx_hw, max=ctr_clamp, min=-ctr_clamp) - dhw = torch.clamp(dhw, max=max_ratio) - else: - dhw = dhw.clamp(min=-max_ratio, max=max_ratio) - - gyx = pyx + dyx_hw - ghw = phw * dhw.exp() - y1x1 = gyx - (ghw * 0.5) - y2x2 = gyx + (ghw * 0.5) - ymin, xmin = y1x1[:, 0].reshape(-1, 1), y1x1[:, 1].reshape(-1, 1) - ymax, xmax = y2x2[:, 0].reshape(-1, 1), y2x2[:, 1].reshape(-1, 1) - bboxes = torch.cat([xmin, ymin, xmax, ymax], dim=-1) - if clip_border and max_shape is not None: - bboxes[..., 0::2].clamp_(min=0, max=max_shape[1]) - bboxes[..., 1::2].clamp_(min=0, max=max_shape[0]) - bboxes = bboxes.reshape(num_bboxes, -1) - return bboxes - - -def YXbbox2delta(proposals, gt, means=(0., 0., 0., 0.), stds=(1., 1., 1., 1.)): - """Compute deltas of proposals w.r.t. gt. - - We usually compute the deltas of x, y, w, h of proposals w.r.t ground - truth bboxes to get regression target. - This is the inverse function of :func:`delta2bbox`. - - Args: - proposals (Tensor): Boxes to be transformed, shape (N, ..., 4) - gt (Tensor): Gt bboxes to be used as base, shape (N, ..., 4) - means (Sequence[float]): Denormalizing means for delta coordinates - stds (Sequence[float]): Denormalizing standard deviation for delta - coordinates - - Returns: - Tensor: deltas with shape (N, 4), where columns represent dx, dy, - dw, dh. - """ - assert proposals.size() == gt.size() - - proposals = proposals.float() - gt = gt.float() - py = (proposals[..., 0] + proposals[..., 2]) * 0.5 - px = (proposals[..., 1] + proposals[..., 3]) * 0.5 - ph = proposals[..., 2] - proposals[..., 0] - pw = proposals[..., 3] - proposals[..., 1] - - gx = (gt[..., 0] + gt[..., 2]) * 0.5 - gy = (gt[..., 1] + gt[..., 3]) * 0.5 - gw = gt[..., 2] - gt[..., 0] - gh = gt[..., 3] - gt[..., 1] - - dx = (gx - px) / pw - dy = (gy - py) / ph - dw = torch.log(gw / pw) - dh = torch.log(gh / ph) - deltas = torch.stack([dy, dx, dh, dw], dim=-1) - - means = deltas.new_tensor(means).unsqueeze(0) - stds = deltas.new_tensor(stds).unsqueeze(0) - deltas = deltas.sub_(means).div_(stds) - - return deltas - - -def YXonnx_delta2bbox(rois, - deltas, - means=(0., 0., 0., 0.), - stds=(1., 1., 1., 1.), - max_shape=None, - wh_ratio_clip=16 / 1000, - clip_border=True, - add_ctr_clamp=False, - ctr_clamp=32): - """Apply deltas to shift/scale base boxes. 
- - Typically the rois are anchor or proposed bounding boxes and the deltas are - network outputs used to shift/scale those boxes. - This is the inverse function of :func:`bbox2delta`. - - Args: - rois (Tensor): Boxes to be transformed. Has shape (N, 4) or (B, N, 4) - deltas (Tensor): Encoded offsets with respect to each roi. - Has shape (B, N, num_classes * 4) or (B, N, 4) or - (N, num_classes * 4) or (N, 4). Note N = num_anchors * W * H - when rois is a grid of anchors.Offset encoding follows [1]_. - means (Sequence[float]): Denormalizing means for delta coordinates. - Default (0., 0., 0., 0.). - stds (Sequence[float]): Denormalizing standard deviation for delta - coordinates. Default (1., 1., 1., 1.). - max_shape (Sequence[int] or torch.Tensor or Sequence[ - Sequence[int]],optional): Maximum bounds for boxes, specifies - (H, W, C) or (H, W). If rois shape is (B, N, 4), then - the max_shape should be a Sequence[Sequence[int]] - and the length of max_shape should also be B. Default None. - wh_ratio_clip (float): Maximum aspect ratio for boxes. - Default 16 / 1000. - clip_border (bool, optional): Whether clip the objects outside the - border of the image. Default True. - add_ctr_clamp (bool): Whether to add center clamp, when added, the - predicted box is clamped is its center is too far away from - the original anchor's center. Only used by YOLOF. Default False. - ctr_clamp (int): the maximum pixel shift to clamp. Only used by YOLOF. - Default 32. - - Returns: - Tensor: Boxes with shape (B, N, num_classes * 4) or (B, N, 4) or - (N, num_classes * 4) or (N, 4), where 4 represent - tl_x, tl_y, br_x, br_y. - - References: - .. [1] https://arxiv.org/abs/1311.2524 - - Example: - >>> rois = torch.Tensor([[ 0., 0., 1., 1.], - >>> [ 0., 0., 1., 1.], - >>> [ 0., 0., 1., 1.], - >>> [ 5., 5., 5., 5.]]) - >>> deltas = torch.Tensor([[ 0., 0., 0., 0.], - >>> [ 1., 1., 1., 1.], - >>> [ 0., 0., 2., -1.], - >>> [ 0.7, -1.9, -0.5, 0.3]]) - >>> delta2bbox(rois, deltas, max_shape=(32, 32, 3)) - tensor([[0.0000, 0.0000, 1.0000, 1.0000], - [0.1409, 0.1409, 2.8591, 2.8591], - [0.0000, 0.3161, 4.1945, 0.6839], - [5.0000, 5.0000, 5.0000, 5.0000]]) - """ - means = deltas.new_tensor(means).view(1, - -1).repeat(1, - deltas.size(-1) // 4) - stds = deltas.new_tensor(stds).view(1, -1).repeat(1, deltas.size(-1) // 4) - denorm_deltas = deltas * stds + means - dy = denorm_deltas[..., 0::4] - dx = denorm_deltas[..., 1::4] - dh = denorm_deltas[..., 2::4] - dw = denorm_deltas[..., 3::4] - - y1, x1 = rois[..., 0], rois[..., 1] - y2, x2 = rois[..., 2], rois[..., 3] - # Compute center of each roi - px = ((x1 + x2) * 0.5).unsqueeze(-1).expand_as(dx) - py = ((y1 + y2) * 0.5).unsqueeze(-1).expand_as(dy) - # Compute width/height of each roi - pw = (x2 - x1).unsqueeze(-1).expand_as(dw) - ph = (y2 - y1).unsqueeze(-1).expand_as(dh) - - dx_width = pw * dx - dy_height = ph * dy - - max_ratio = np.abs(np.log(wh_ratio_clip)) - if add_ctr_clamp: - dx_width = torch.clamp(dx_width, max=ctr_clamp, min=-ctr_clamp) - dy_height = torch.clamp(dy_height, max=ctr_clamp, min=-ctr_clamp) - dw = torch.clamp(dw, max=max_ratio) - dh = torch.clamp(dh, max=max_ratio) - else: - dw = dw.clamp(min=-max_ratio, max=max_ratio) - dh = dh.clamp(min=-max_ratio, max=max_ratio) - # Use exp(network energy) to enlarge/shrink each roi - gw = pw * dw.exp() - gh = ph * dh.exp() - # Use network energy to shift the center of each roi - gx = px + dx_width - gy = py + dy_height - # Convert center-xy/width/height to top-left, bottom-right - x1 = gx - gw * 0.5 - y1 = gy - gh * 
0.5 - x2 = gx + gw * 0.5 - y2 = gy + gh * 0.5 - - bboxes = torch.stack([x1, y1, x2, y2], dim=-1).view(deltas.size()) - - if clip_border and max_shape is not None: - # clip bboxes with dynamic `min` and `max` for onnx - if torch.onnx.is_in_onnx_export(): - from mmdet.core.export import dynamic_clip_for_onnx - x1, y1, x2, y2 = dynamic_clip_for_onnx(x1, y1, x2, y2, max_shape) - bboxes = torch.stack([x1, y1, x2, y2], dim=-1).view(deltas.size()) - return bboxes - if not isinstance(max_shape, torch.Tensor): - max_shape = x1.new_tensor(max_shape) - max_shape = max_shape[..., :2].type_as(x1) - if max_shape.ndim == 2: - assert bboxes.ndim == 3 - assert max_shape.size(0) == bboxes.size(0) - - min_xy = x1.new_tensor(0) - max_xy = torch.cat( - [max_shape] * (deltas.size(-1) // 2), - dim=-1).flip(-1).unsqueeze(-2) - bboxes = torch.where(bboxes < min_xy, min_xy, bboxes) - bboxes = torch.where(bboxes > max_xy, max_xy, bboxes) - - return bboxes diff --git a/projects/EfficientDet/efficientdet/utils.py b/projects/EfficientDet/efficientdet/utils.py deleted file mode 100644 index 9c30a01fc..000000000 --- a/projects/EfficientDet/efficientdet/utils.py +++ /dev/null @@ -1,154 +0,0 @@ -import math -from typing import Tuple, Union - -import torch -import torch.nn as nn -from mmcv.cnn.bricks import Swish, build_norm_layer -from torch.nn import functional as F -from torch.nn.init import _calculate_fan_in_and_fan_out, trunc_normal_ - -from mmdet.registry import MODELS -from mmdet.utils import OptConfigType - - -def variance_scaling_trunc(tensor, gain=1.): - fan_in, _ = _calculate_fan_in_and_fan_out(tensor) - gain /= max(1.0, fan_in) - std = math.sqrt(gain) / .87962566103423978 - return trunc_normal_(tensor, 0., std) - - -@MODELS.register_module() -class Conv2dSamePadding(nn.Conv2d): - - def __init__(self, - in_channels: int, - out_channels: int, - kernel_size: Union[int, Tuple[int, int]], - stride: Union[int, Tuple[int, int]] = 1, - padding: Union[int, Tuple[int, int]] = 0, - dilation: Union[int, Tuple[int, int]] = 1, - groups: int = 1, - bias: bool = True): - super().__init__(in_channels, out_channels, kernel_size, stride, 0, - dilation, groups, bias) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - img_h, img_w = x.size()[-2:] - kernel_h, kernel_w = self.weight.size()[-2:] - extra_w = (math.ceil(img_w / self.stride[1]) - - 1) * self.stride[1] - img_w + kernel_w - extra_h = (math.ceil(img_h / self.stride[0]) - - 1) * self.stride[0] - img_h + kernel_h - - left = extra_w // 2 - right = extra_w - left - top = extra_h // 2 - bottom = extra_h - top - x = F.pad(x, [left, right, top, bottom]) - return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, - self.dilation, self.groups) - - -class MaxPool2dSamePadding(nn.Module): - - def __init__(self, - kernel_size: Union[int, Tuple[int, int]] = 3, - stride: Union[int, Tuple[int, int]] = 2, - **kwargs): - super().__init__() - self.pool = nn.MaxPool2d(kernel_size, stride, **kwargs) - self.stride = self.pool.stride - self.kernel_size = self.pool.kernel_size - - if isinstance(self.stride, int): - self.stride = [self.stride] * 2 - if isinstance(self.kernel_size, int): - self.kernel_size = [self.kernel_size] * 2 - - def forward(self, x): - h, w = x.shape[-2:] - - extra_h = (math.ceil(w / self.stride[1]) - - 1) * self.stride[1] - w + self.kernel_size[1] - extra_v = (math.ceil(h / self.stride[0]) - - 1) * self.stride[0] - h + self.kernel_size[0] - - left = extra_h // 2 - right = extra_h - left - top = extra_v // 2 - bottom = extra_v - top - - x = F.pad(x, [left, 
right, top, bottom]) - x = self.pool(x) - - return x - - -class DepthWiseConvBlock(nn.Module): - - def __init__( - self, - in_channels: int, - out_channels: int, - apply_norm: bool = True, - conv_bn_act_pattern: bool = False, - norm_cfg: OptConfigType = dict(type='BN', momentum=1e-2, eps=1e-3) - ) -> None: - super(DepthWiseConvBlock, self).__init__() - self.depthwise_conv = Conv2dSamePadding( - in_channels, - in_channels, - kernel_size=3, - stride=1, - groups=in_channels, - bias=False) - self.pointwise_conv = Conv2dSamePadding( - in_channels, out_channels, kernel_size=1, stride=1) - - self.apply_norm = apply_norm - if self.apply_norm: - self.bn = build_norm_layer(norm_cfg, num_features=out_channels)[1] - - self.apply_activation = conv_bn_act_pattern - if self.apply_activation: - self.swish = Swish() - - def forward(self, x): - x = self.depthwise_conv(x) - x = self.pointwise_conv(x) - if self.apply_norm: - x = self.bn(x) - if self.apply_activation: - x = self.swish(x) - - return x - - -class DownChannelBlock(nn.Module): - - def __init__( - self, - in_channels: int, - out_channels: int, - apply_norm: bool = True, - conv_bn_act_pattern: bool = False, - norm_cfg: OptConfigType = dict(type='BN', momentum=1e-2, eps=1e-3) - ) -> None: - super(DownChannelBlock, self).__init__() - self.down_conv = Conv2dSamePadding(in_channels, out_channels, 1) - self.apply_norm = apply_norm - if self.apply_norm: - self.bn = build_norm_layer(norm_cfg, num_features=out_channels)[1] - self.apply_activation = conv_bn_act_pattern - if self.apply_activation: - self.swish = Swish() - - def forward(self, x): - x = self.down_conv(x) - if self.apply_norm: - x = self.bn(x) - if self.apply_activation: - x = self.swish(x) - - return x diff --git a/projects/HDINO/README.md b/projects/HDINO/README.md deleted file mode 100644 index 078ca4293..000000000 --- a/projects/HDINO/README.md +++ /dev/null @@ -1,35 +0,0 @@ -# H-DETR - -> [DETRs with Hybrid Matching](https://arxiv.org/abs/2207.13080) - - - -## Abstract - -One-to-one set matching is a key design for DETR to establish its end-to-end capability, so that object detection does not require a hand-crafted NMS (non-maximum suppression) to remove duplicate detections. This end-to-end signature is important for the versatility of DETR, and it has been generalized to broader vision tasks. However, we note that there are few queries assigned as positive samples and the one-to-one set matching significantly reduces the training efficacy of positive samples. We propose a simple yet effective method based on a hybrid matching scheme that combines the original one-to-one matching branch with an auxiliary one-to-many matching branch during training. Our hybrid strategy has been shown to significantly improve accuracy. In inference, only the original one-to-one match branch is used, thus maintaining the end-to-end merit and the same inference efficiency of DETR. The method is named H-DETR, and it shows that a wide range of representative DETR methods can be consistently improved across a wide range of visual tasks, including DeformableDETR, PETRv2, PETR, and TransTrack, among others. - -
- -
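A rough sketch of the hybrid scheme described above, using the `num_query_one2one` and `k_one2many` values that appear in the config later in this PR. This is illustrative only, not the project's actual implementation: the queries are split into a one-to-one group and an auxiliary one-to-many group, and each ground-truth target is repeated `k` times for the auxiliary branch during training.

```python
import torch

num_queries = 1800          # total queries in the config below
num_query_one2one = 900     # first group: standard one-to-one matching
k_one2many = 2              # each GT is repeated k times for the auxiliary branch

queries = torch.randn(num_queries, 256)
one2one_queries = queries[:num_query_one2one]
one2many_queries = queries[num_query_one2one:]

gt_labels = torch.tensor([3, 17])
# Auxiliary one-to-many targets: every ground-truth label/box is duplicated
# k_one2many times, so several queries can be matched to the same object.
one2many_labels = gt_labels.repeat(k_one2many)
print(one2many_labels)  # tensor([ 3, 17,  3, 17])
```

At inference only the one-to-one group is used, which is what preserves the NMS-free, end-to-end behaviour mentioned in the abstract.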
- -## Results and Models - -| Backbone | Model | Lr schd | box AP | Config | Download | -| :------: | :-----------: | :-----: | :----: | :--------------------------------------------: | :------: | -| R-50 | H-DINO-4scale | 12e | 48.0 | [config](./h-dino-4scale_r50_8xb2-12e_coco.py) | | - -### NOTE - -1. We are based on `DINO` rather than `Deformable DETR` to support the `Hybrid Matching` algorithm. -2. We found that directly applying Hybrid Matching to the DINO algorithm results in a significant decrease in performance. If you have any other insights or suggestions, please feel free to comment or submit a pull request (PR). - -## Citation - -```latex -@article{jia2022detrs, - title={DETRs with Hybrid Matching}, - author={Jia, Ding and Yuan, Yuhui and He, Haodi and Wu, Xiaopei and Yu, Haojun and Lin, Weihong and Sun, Lei and Zhang, Chao and Hu, Han}, - journal={arXiv preprint arXiv:2207.13080}, - year={2022} -} -``` diff --git a/projects/HDINO/__init__.py b/projects/HDINO/__init__.py deleted file mode 100644 index f8c3478b9..000000000 --- a/projects/HDINO/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .h_dino import HDINO -from .h_dino_head import HybridDINOHead - -__all__ = ['HDINO', 'HybridDINOHead'] diff --git a/projects/HDINO/h-dino-4scale_r50_8xb2-12e_coco.py b/projects/HDINO/h-dino-4scale_r50_8xb2-12e_coco.py deleted file mode 100644 index 7b16b48dc..000000000 --- a/projects/HDINO/h-dino-4scale_r50_8xb2-12e_coco.py +++ /dev/null @@ -1,168 +0,0 @@ -_base_ = [ - '../../configs/_base_/datasets/coco_detection.py', - '../../configs/_base_/default_runtime.py' -] - -custom_imports = dict(imports=['projects.HDINO'], allow_failed_imports=False) - -model = dict( - type='HDINO', - num_queries=1800, # num_total_queries: 900+900 - with_box_refine=True, - as_two_stage=True, - data_preprocessor=dict( - type='DetDataPreprocessor', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - bgr_to_rgb=True, - pad_size_divisor=1), - backbone=dict( - type='ResNet', - depth=50, - num_stages=4, - out_indices=(1, 2, 3), - frozen_stages=1, - norm_cfg=dict(type='BN', requires_grad=False), - norm_eval=True, - style='pytorch', - init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), - neck=dict( - type='ChannelMapper', - in_channels=[512, 1024, 2048], - kernel_size=1, - out_channels=256, - act_cfg=None, - norm_cfg=dict(type='GN', num_groups=32), - num_outs=4), - encoder=dict( - num_layers=6, - layer_cfg=dict( - self_attn_cfg=dict(embed_dims=256, num_levels=4, - dropout=0.0), # 0.1 for DeformDETR - ffn_cfg=dict( - embed_dims=256, - feedforward_channels=2048, # 1024 for DeformDETR - ffn_drop=0.0))), # 0.1 for DeformDETR - decoder=dict( - num_layers=6, - return_intermediate=True, - layer_cfg=dict( - self_attn_cfg=dict(embed_dims=256, num_heads=8, - dropout=0.0), # 0.1 for DeformDETR - cross_attn_cfg=dict(embed_dims=256, num_levels=4, - dropout=0.0), # 0.1 for DeformDETR - ffn_cfg=dict( - embed_dims=256, - feedforward_channels=2048, # 1024 for DeformDETR - ffn_drop=0.0)), # 0.1 for DeformDETR - post_norm_cfg=None), - positional_encoding=dict( - num_feats=128, - normalize=True, - offset=0.0, # -0.5 for DeformDETR - temperature=20), # 10000 for DeformDETR - bbox_head=dict( - type='HybridDINOHead', - num_classes=80, - sync_cls_avg_factor=True, - num_query_one2one=900, - k_one2many=2, - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=1.0), # 2.0 in DeformDETR - loss_bbox=dict(type='L1Loss', loss_weight=5.0), - loss_iou=dict(type='GIoULoss', 
loss_weight=2.0)), - dn_cfg=dict( - label_noise_scale=0.5, - box_noise_scale=1.0, # 0.4 for DN-DETR - group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=100)), - # training and testing settings - train_cfg=dict( - assigner=dict( - type='HungarianAssigner', - match_costs=[ - dict(type='FocalLossCost', weight=2.0), - dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), - dict(type='IoUCost', iou_mode='giou', weight=2.0) - ])), - test_cfg=dict(max_per_img=300)) # 100 for DeformDETR - -# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different -# from the default setting in mmdet. -train_pipeline = [ - dict(type='LoadImageFromFile', backend_args=_base_.backend_args), - dict(type='LoadAnnotations', with_bbox=True), - dict(type='RandomFlip', prob=0.5), - dict( - type='RandomChoice', - transforms=[ - [ - dict( - type='RandomChoiceResize', - scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), - (608, 1333), (640, 1333), (672, 1333), (704, 1333), - (736, 1333), (768, 1333), (800, 1333)], - keep_ratio=True) - ], - [ - dict( - type='RandomChoiceResize', - # The radio of all image in train dataset < 7 - # follow the original implement - scales=[(400, 4200), (500, 4200), (600, 4200)], - keep_ratio=True), - dict( - type='RandomCrop', - crop_type='absolute_range', - crop_size=(384, 600), - allow_negative_crop=True), - dict( - type='RandomChoiceResize', - scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), - (608, 1333), (640, 1333), (672, 1333), (704, 1333), - (736, 1333), (768, 1333), (800, 1333)], - keep_ratio=True) - ] - ]), - dict(type='PackDetInputs') -] -train_dataloader = dict( - dataset=dict( - filter_cfg=dict(filter_empty_gt=False), pipeline=train_pipeline)) - -# optimizer -optim_wrapper = dict( - type='OptimWrapper', - optimizer=dict( - type='AdamW', - lr=0.0001, # 0.0002 for DeformDETR - weight_decay=0.0001), - clip_grad=dict(max_norm=0.1, norm_type=2), - paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.1)}) -) # custom_keys contains sampling_offsets and reference_points in DeformDETR # noqa - -# learning policy -max_epochs = 12 -train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) - -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') - -param_scheduler = [ - dict( - type='MultiStepLR', - begin=0, - end=max_epochs, - by_epoch=True, - milestones=[11], - gamma=0.1) -] - -# NOTE: `auto_scale_lr` is for automatically scaling LR, -# USER SHOULD NOT CHANGE ITS VALUES. -# base_batch_size = (8 GPUs) x (2 samples per GPU) -auto_scale_lr = dict(base_batch_size=16) diff --git a/projects/HDINO/h_dino.py b/projects/HDINO/h_dino.py deleted file mode 100644 index 3f9d116d8..000000000 --- a/projects/HDINO/h_dino.py +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
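# Illustrative note (a sketch, not part of the original file): the config
# above uses num_queries=1800 with num_query_one2one=900, i.e. the queries
# split evenly into a one-to-one group and an auxiliary one-to-many group.
# During training, HDINO widens DINO's denoising self-attention mask so the
# one-to-one queries cannot attend to the one-to-many ones, roughly:
#
#     import torch
#     num_dn, num_o2o, num_o2m = 100, 900, 900   # example sizes
#     total = num_dn + num_o2o + num_o2m
#     attn_mask = torch.zeros(total, total, dtype=torch.bool)
#     # one-to-one query rows may not see one-to-many key columns
#     attn_mask[num_dn:num_dn + num_o2o, num_dn + num_o2o:] = True
#
# The actual masking is done in `HDINO.pre_decoder` below.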
-from typing import Tuple - -import torch -from torch import Tensor, nn -from torch.nn.init import normal_ - -from mmdet.models.detectors import DINO, DeformableDETR -from mmdet.models.detectors.deformable_detr import \ - MultiScaleDeformableAttention -from mmdet.registry import MODELS -from mmdet.structures import OptSampleList -from mmdet.utils import OptConfigType - - -@MODELS.register_module() -class HDINO(DINO): - - def __init__(self, - *args, - bbox_head: OptConfigType = None, - **kwargs) -> None: - self.method = 0 - self.num_query_one2one = bbox_head['num_query_one2one'] - super(HDINO, self).__init__(*args, bbox_head=bbox_head, **kwargs) - - def _init_layers(self) -> None: - super(HDINO, self)._init_layers() - self.query_embedding = None - if self.method == 1: - self.query_map = nn.Linear(self.embed_dims, self.embed_dims) - else: - self.pos_trans_fc = nn.Linear(self.embed_dims * 2, self.embed_dims) - self.pos_trans_norm = nn.LayerNorm(self.embed_dims) - - def init_weights(self) -> None: - super(DeformableDETR, self).init_weights() - """Initialize weights for Transformer and other components.""" - for coder in self.encoder, self.decoder: - for p in coder.parameters(): - if p.dim() > 1: - nn.init.xavier_uniform_(p) - for m in self.modules(): - if isinstance(m, MultiScaleDeformableAttention): - m.init_weights() - nn.init.xavier_uniform_(self.memory_trans_fc.weight) - normal_(self.level_embed) - - if self.method == 1: - nn.init.xavier_uniform_(self.query_map.weight) - else: - nn.init.xavier_uniform_(self.pos_trans_fc.weight) - - def pre_decoder( - self, - memory: Tensor, - memory_mask: Tensor, - spatial_shapes: Tensor, - batch_data_samples: OptSampleList = None, - ) -> Tuple[dict, dict]: - - bs, _, c = memory.shape - cls_out_features = self.bbox_head.cls_branches[ - self.decoder.num_layers].out_features - - output_memory, output_proposals = self.gen_encoder_output_proposals( - memory, memory_mask, spatial_shapes) - enc_outputs_class = self.bbox_head.cls_branches[ - self.decoder.num_layers]( - output_memory) - enc_outputs_coord_unact = self.bbox_head.reg_branches[ - self.decoder.num_layers](output_memory) + output_proposals - - # NOTE The DINO selects top-k proposals according to scores of - # multi-class classification, while DeformDETR, where the input - # is `enc_outputs_class[..., 0]` selects according to scores of - # binary classification. - topk_indices = torch.topk( - enc_outputs_class.max(-1)[0], k=self.num_queries, dim=1)[1] - topk_score = torch.gather( - enc_outputs_class, 1, - topk_indices.unsqueeze(-1).repeat(1, 1, cls_out_features)) - topk_coords_unact = torch.gather( - enc_outputs_coord_unact, 1, - topk_indices.unsqueeze(-1).repeat(1, 1, 4)) - topk_coords = topk_coords_unact.sigmoid() - topk_coords_unact = topk_coords_unact.detach() - - # We only made changes here. 
- # ------------------------------------- - if self.method == 1: - map_memory = self.query_map(memory.detach()) - query = torch.gather( - map_memory, 1, - topk_indices.unsqueeze(-1).repeat(1, 1, self.embed_dims)) - else: - pos_trans_out = self.pos_trans_fc( - self.get_proposal_pos_embed(topk_coords_unact)) - query = self.pos_trans_norm(pos_trans_out) - # ------------------------------------- - - if self.training: - dn_label_query, dn_bbox_query, dn_mask, dn_meta = \ - self.dn_query_generator(batch_data_samples) - query = torch.cat([dn_label_query, query], dim=1) - reference_points = torch.cat([dn_bbox_query, topk_coords_unact], - dim=1) - else: - reference_points = topk_coords_unact - dn_mask, dn_meta = None, None - reference_points = reference_points.sigmoid() - - decoder_inputs_dict = dict( - query=query, - memory=memory, - reference_points=reference_points, - dn_mask=dn_mask) - # NOTE DINO calculates encoder losses on scores and coordinates - # of selected top-k encoder queries, while DeformDETR is of all - # encoder queries. - head_inputs_dict = dict( - enc_outputs_class=topk_score, - enc_outputs_coord=topk_coords, - dn_meta=dn_meta) if self.training else dict() - - # We only made changes here. - # ------------------------------------- - if self.training: - # train: num_denoising_queries + num_query_one2one - # + num_query_one2many - dn_mask = decoder_inputs_dict['dn_mask'] - num_denoising_queries = head_inputs_dict['dn_meta'][ - 'num_denoising_queries'] - num_query_one2one = num_denoising_queries + self.num_query_one2one - # dn_mask[num_query_one2one:, :num_query_one2one] = True - dn_mask[num_denoising_queries:num_query_one2one, - num_query_one2one:] = True - decoder_inputs_dict['dn_mask'] = dn_mask - else: - # test: num_query_one2one - # + num_query_one2many - query = decoder_inputs_dict['query'] - reference_points = decoder_inputs_dict['reference_points'] - num_query_one2many = self.num_queries - self.num_query_one2one - decoder_inputs_dict['query'] = query[:num_query_one2many] - decoder_inputs_dict[ - 'reference_points'] = reference_points[:num_query_one2many] - # ------------------------------------- - return decoder_inputs_dict, head_inputs_dict diff --git a/projects/HDINO/h_dino_head.py b/projects/HDINO/h_dino_head.py deleted file mode 100644 index aa1d0867f..000000000 --- a/projects/HDINO/h_dino_head.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, List - -from torch import Tensor - -from mmdet.models.dense_heads.dino_head import DINOHead -from mmdet.models.utils import multi_apply -from mmdet.registry import MODELS -from mmdet.utils import InstanceList, OptInstanceList - - -@MODELS.register_module() -class HybridDINOHead(DINOHead): - """Head of the Hybrid Matching.""" - - def __init__(self, - *args, - num_query_one2one: int = 900, - k_one2many: int = 2, - **kwargs) -> None: - self.num_query_one2one = num_query_one2one - self.k_one2many = k_one2many - super().__init__(*args, **kwargs) - - def loss_by_feat( - self, - all_layers_cls_scores: Tensor, - all_layers_bbox_preds: Tensor, - enc_cls_scores: Tensor, - enc_bbox_preds: Tensor, - batch_gt_instances: InstanceList, - batch_img_metas: List[dict], - dn_meta: Dict[str, int], - batch_gt_instances_ignore: OptInstanceList = None - ) -> Dict[str, Tensor]: - """Loss function. 
- - Args: - all_layers_cls_scores (Tensor): Classification scores of all - decoder layers, has shape (num_decoder_layers, bs, - num_queries_total, cls_out_channels), where - `num_queries_total` is the sum of `num_denoising_queries` - and `num_matching_queries`. - all_layers_bbox_preds (Tensor): Regression outputs of all decoder - layers. Each is a 4D-tensor with normalized coordinate format - (cx, cy, w, h) and has shape (num_decoder_layers, bs, - num_queries_total, 4). - enc_cls_scores (Tensor): The score of each point on encode - feature map, has shape (bs, num_feat_points, cls_out_channels). - enc_bbox_preds (Tensor): The proposal generate from the encode - feature map, has shape (bs, num_feat_points, 4) with the last - dimension arranged as (cx, cy, w, h). - batch_gt_instances (list[:obj:`InstanceData`]): Batch of - gt_instance. It usually includes ``bboxes`` and ``labels`` - attributes. - batch_img_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - dn_meta (Dict[str, int]): The dictionary saves information about - group collation, including 'num_denoising_queries' and - 'num_denoising_groups'. It will be used for split outputs of - denoising and matching parts and loss calculation. - batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): - Batch of gt_instances_ignore. It includes ``bboxes`` attribute - data that is ignored during training and testing. - Defaults to None. - - Returns: - dict[str, Tensor]: A dictionary of loss components. - """ - # train: num_denoising_queries + num_query_one2one - # + num_query_one2many - num_query_one2one = dn_meta[ - 'num_denoising_queries'] + self.num_query_one2one - outputs_classes_one2one = \ - all_layers_cls_scores[:, :, 0: num_query_one2one, :] - outputs_coords_one2one = \ - all_layers_bbox_preds[:, :, 0: num_query_one2one, :] - # hybrid-matching part - outputs_classes_one2many = \ - all_layers_cls_scores[:, :, num_query_one2one:, :] - outputs_coords_one2many = \ - all_layers_bbox_preds[:, :, num_query_one2one:, :] - - loss_dict = super(HybridDINOHead, self).loss_by_feat( - outputs_classes_one2one, outputs_coords_one2one, enc_cls_scores, - enc_bbox_preds, batch_gt_instances, batch_img_metas, dn_meta, - batch_gt_instances_ignore) - - o2m_batch_gt_instances = [] - for gt_instance in batch_gt_instances: - bboxes = gt_instance.bboxes.repeat(self.k_one2many, 1) - labels = gt_instance.labels.repeat(self.k_one2many) - new_gt_instance = gt_instance.new(bboxes=bboxes, labels=labels) - o2m_batch_gt_instances.append(new_gt_instance) - - losses_cls_o2m, losses_bbox_o2m, losses_iou_o2m = multi_apply( - self.loss_by_feat_single, - outputs_classes_one2many, - outputs_coords_one2many, - batch_gt_instances=o2m_batch_gt_instances, - batch_img_metas=batch_img_metas) - - loss_dict['loss_cls_o2m'] = losses_cls_o2m[-1] - loss_dict['loss_bbox_o2m'] = losses_bbox_o2m[-1] - loss_dict['loss_iou_o2m'] = losses_iou_o2m[-1] - for num_dec_layer, (loss_cls_i, loss_bbox_i, loss_iou_i) in \ - enumerate(zip(losses_cls_o2m[:-1], losses_bbox_o2m[:-1], - losses_iou_o2m[:-1])): - loss_dict[f'd{num_dec_layer}.loss_cls_o2m'] = loss_cls_i - loss_dict[f'd{num_dec_layer}.loss_bbox_o2m'] = loss_bbox_i - loss_dict[f'd{num_dec_layer}.loss_iou_o2m'] = loss_iou_i - return loss_dict diff --git a/projects/LabelStudio/backend_template/_wsgi.py b/projects/LabelStudio/backend_template/_wsgi.py deleted file mode 100644 index 1f8fb68cd..000000000 --- a/projects/LabelStudio/backend_template/_wsgi.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright 
(c) OpenMMLab. All rights reserved. -import argparse -import json -import logging -import logging.config -import os - -logging.config.dictConfig({ - 'version': 1, - 'formatters': { - 'standard': { - 'format': - '[%(asctime)s] [%(levelname)s] [%(name)s::%(funcName)s::%(lineno)d] %(message)s' # noqa E501 - } - }, - 'handlers': { - 'console': { - 'class': 'logging.StreamHandler', - 'level': 'DEBUG', - 'stream': 'ext://sys.stdout', - 'formatter': 'standard' - } - }, - 'root': { - 'level': 'ERROR', - 'handlers': ['console'], - 'propagate': True - } -}) - -_DEFAULT_CONFIG_PATH = os.path.join(os.path.dirname(__file__), 'config.json') - - -def get_kwargs_from_config(config_path=_DEFAULT_CONFIG_PATH): - if not os.path.exists(config_path): - return dict() - with open(config_path) as f: - config = json.load(f) - assert isinstance(config, dict) - return config - - -if __name__ == '__main__': - - from label_studio_ml.api import init_app - - from projects.LabelStudio.backend_template.mmdetection import MMDetection - - parser = argparse.ArgumentParser(description='Label studio') - parser.add_argument( - '-p', - '--port', - dest='port', - type=int, - default=9090, - help='Server port') - parser.add_argument( - '--host', dest='host', type=str, default='0.0.0.0', help='Server host') - parser.add_argument( - '--kwargs', - '--with', - dest='kwargs', - metavar='KEY=VAL', - nargs='+', - type=lambda kv: kv.split('='), - help='Additional LabelStudioMLBase model initialization kwargs') - parser.add_argument( - '-d', - '--debug', - dest='debug', - action='store_true', - help='Switch debug mode') - parser.add_argument( - '--log-level', - dest='log_level', - choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'], - default=None, - help='Logging level') - parser.add_argument( - '--model-dir', - dest='model_dir', - default=os.path.dirname(__file__), - help='Directory models are store', - ) - parser.add_argument( - '--check', - dest='check', - action='store_true', - help='Validate model instance before launching server') - - args = parser.parse_args() - - # setup logging level - if args.log_level: - logging.root.setLevel(args.log_level) - - def isfloat(value): - try: - float(value) - return True - except ValueError: - return False - - def parse_kwargs(): - param = dict() - for k, v in args.kwargs: - if v.isdigit(): - param[k] = int(v) - elif v == 'True' or v == 'true': - param[k] = True - elif v == 'False' or v == 'False': - param[k] = False - elif isfloat(v): - param[k] = float(v) - else: - param[k] = v - return param - - kwargs = get_kwargs_from_config() - - if args.kwargs: - kwargs.update(parse_kwargs()) - - if args.check: - print('Check "' + MMDetection.__name__ + '" instance creation..') - model = MMDetection(**kwargs) - - app = init_app( - model_class=MMDetection, - model_dir=os.environ.get('MODEL_DIR', args.model_dir), - redis_queue=os.environ.get('RQ_QUEUE_NAME', 'default'), - redis_host=os.environ.get('REDIS_HOST', 'localhost'), - redis_port=os.environ.get('REDIS_PORT', 6379), - **kwargs) - - app.run(host=args.host, port=args.port, debug=args.debug) - -else: - # for uWSGI use - app = init_app( - model_class=MMDetection, - model_dir=os.environ.get('MODEL_DIR', os.path.dirname(__file__)), - redis_queue=os.environ.get('RQ_QUEUE_NAME', 'default'), - redis_host=os.environ.get('REDIS_HOST', 'localhost'), - redis_port=os.environ.get('REDIS_PORT', 6379)) diff --git a/projects/LabelStudio/backend_template/mmdetection.py b/projects/LabelStudio/backend_template/mmdetection.py deleted file mode 100644 index f25e80e8f..000000000 --- 
a/projects/LabelStudio/backend_template/mmdetection.py +++ /dev/null @@ -1,148 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import io -import json -import logging -import os -from urllib.parse import urlparse - -import boto3 -from botocore.exceptions import ClientError -from label_studio_ml.model import LabelStudioMLBase -from label_studio_ml.utils import (DATA_UNDEFINED_NAME, get_image_size, - get_single_tag_keys) -from label_studio_tools.core.utils.io import get_data_dir - -from mmdet.apis import inference_detector, init_detector - -logger = logging.getLogger(__name__) - - -class MMDetection(LabelStudioMLBase): - """Object detector based on https://github.com/open-mmlab/mmdetection.""" - - def __init__(self, - config_file=None, - checkpoint_file=None, - image_dir=None, - labels_file=None, - score_threshold=0.5, - device='cpu', - **kwargs): - - super(MMDetection, self).__init__(**kwargs) - config_file = config_file or os.environ['config_file'] - checkpoint_file = checkpoint_file or os.environ['checkpoint_file'] - self.config_file = config_file - self.checkpoint_file = checkpoint_file - self.labels_file = labels_file - # default Label Studio image upload folder - upload_dir = os.path.join(get_data_dir(), 'media', 'upload') - self.image_dir = image_dir or upload_dir - logger.debug( - f'{self.__class__.__name__} reads images from {self.image_dir}') - if self.labels_file and os.path.exists(self.labels_file): - self.label_map = json_load(self.labels_file) - else: - self.label_map = {} - - self.from_name, self.to_name, self.value, self.labels_in_config = get_single_tag_keys( # noqa E501 - self.parsed_label_config, 'RectangleLabels', 'Image') - schema = list(self.parsed_label_config.values())[0] - self.labels_in_config = set(self.labels_in_config) - - # Collect label maps from `predicted_values="airplane,car"` attribute in
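# Illustrative usage sketch (assumes a Label Studio ML environment; the model
# paths below are placeholders, and config_file / checkpoint_file may instead
# be supplied through the environment variables read in __init__ above):
#
#     from label_studio_ml.api import init_app
#     from projects.LabelStudio.backend_template.mmdetection import MMDetection
#
#     app = init_app(
#         model_class=MMDetection,
#         config_file='configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py',  # placeholder
#         checkpoint_file='checkpoints/faster-rcnn_r50_fpn_1x_coco.pth',     # placeholder
#         score_threshold=0.5,
#         device='cpu')
#     app.run(host='0.0.0.0', port=9090)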