1. Position embedding

```python
import math

import torch
from torch import nn

from util.misc import NestedTensor  # DETR's container pairing a padded tensor with its mask


class PositionEmbeddingSine(nn.Module):
    """
    This is a more standard version of the position embedding, very similar to the one
    used by the Attention is all you need paper, generalized to work on images.
    """
    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
        super().__init__()
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi
        self.scale = scale

    def forward(self, tensor_list: NestedTensor):
        x = tensor_list.tensors
        mask = tensor_list.mask
        assert mask is not None
        not_mask = ~mask
        # cumulative sums over the unmasked pixels give each pixel its (row, col) index
        y_embed = not_mask.cumsum(1, dtype=torch.float32)
        x_embed = not_mask.cumsum(2, dtype=torch.float32)
        if self.normalize:
            eps = 1e-6
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)

        # interleave sin/cos over the channel dimension, then concatenate the y- and x-encodings
        pos_x = x_embed[:, :, :, None] / dim_t
        pos_y = y_embed[:, :, :, None] / dim_t
        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
        return pos
```
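
For a quick sanity check, here is how the embedding behaves on a dummy input; the `NestedTensor` wrapper comes from DETR's `util.misc`, and the sizes below are made up for illustration.

```python
import torch

# Illustrative sizes: batch of 2, a 25x34 feature map, 128 features per axis.
pe = PositionEmbeddingSine(num_pos_feats=128, normalize=True)
feat = torch.randn(2, 256, 25, 34)               # CNN feature map, N x C x H x W
mask = torch.zeros(2, 25, 34, dtype=torch.bool)  # False = real pixel, True = padding
pos = pe(NestedTensor(feat, mask))
print(pos.shape)  # torch.Size([2, 256, 25, 34]) -- one 256-d position vector per pixel
```

Each pixel gets `num_pos_feats` sine/cosine channels for its row index and another `num_pos_feats` for its column index, so the output has `2 * num_pos_feats` channels.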

2. Building a ResNet backbone to extract CNN features

```python
import torchvision

# BackboneBase and FrozenBatchNorm2d are defined earlier in DETR's models/backbone.py;
# is_main_process comes from util.misc.
class Backbone(BackboneBase):
    """ResNet backbone with frozen BatchNorm."""
    def __init__(self, name: str,
                 train_backbone: bool,
                 return_interm_layers: bool,
                 dilation: bool):
        backbone = getattr(torchvision.models, name)(
            replace_stride_with_dilation=[False, False, dilation],
            pretrained=is_main_process(), norm_layer=FrozenBatchNorm2d)
        # ResNet-18/34 end with 512 channels, the deeper variants with 2048
        num_channels = 512 if name in ('resnet18', 'resnet34') else 2048
        super().__init__(backbone, train_backbone, num_channels, return_interm_layers)
```
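
In the repo these pieces are glued together by `build_backbone` (models/backbone.py), which wraps the CNN and the position embedding in a small `Joiner` module. A condensed sketch with illustrative arguments; `Joiner` and `nested_tensor_from_tensor_list` are not shown in this post and come from models/backbone.py and util/misc.py respectively.

```python
import torch
from util.misc import nested_tensor_from_tensor_list

# Joiner is an nn.Sequential subclass: it runs the CNN, then computes a
# position encoding for every returned feature map.
backbone = Backbone('resnet50', train_backbone=True,
                    return_interm_layers=False, dilation=False)
position_embedding = PositionEmbeddingSine(num_pos_feats=128, normalize=True)
model = Joiner(backbone, position_embedding)
model.num_channels = backbone.num_channels  # 2048 for resnet50

imgs = [torch.randn(3, 480, 640)]           # differently sized images get padded
features, pos = model(nested_tensor_from_tensor_list(imgs))
# features[-1].tensors: [1, 2048, 15, 20] (H and W divided by 32)
# pos[-1]:              [1, 256, 15, 20]  (one position vector per feature-map pixel)
```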

3. The transformer re-encodes the CNN features with multi-head attention, then decodes them, again with multi-head attention

The overall flow:

```python
class Transformer(nn.Module):

    def __init__(self, d_model=512, nhead=8, num_encoder_layers=6,
                 num_decoder_layers=6, dim_feedforward=2048, dropout=0.1,
                 activation="relu", normalize_before=False,
                 return_intermediate_dec=False):
        super().__init__()

        encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward,
                                                dropout, activation, normalize_before)
        encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)

        decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward,
                                                dropout, activation, normalize_before)
        decoder_norm = nn.LayerNorm(d_model)
        self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm,
                                          return_intermediate=return_intermediate_dec)

        self._reset_parameters()

        self.d_model = d_model
        self.nhead = nhead

    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, src, mask, query_embed, pos_embed):
        # flatten NxCxHxW to HWxNxC
        bs, c, h, w = src.shape
        src = src.flatten(2).permute(2, 0, 1)
        pos_embed = pos_embed.flatten(2).permute(2, 0, 1)
        query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1)
        mask = mask.flatten(1)

        # the decoder input starts at zero; all information enters via query_embed and memory
        tgt = torch.zeros_like(query_embed)
        memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed)
        hs = self.decoder(tgt, memory, memory_key_padding_mask=mask,
                          pos=pos_embed, query_pos=query_embed)
        return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w)
```
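
Tracing the shapes through `forward` makes the flattening obvious; the numbers below are illustrative (batch of 2, 25x34 feature map, d_model=256, 100 object queries):

```python
import torch

t = Transformer(d_model=256, nhead=8, return_intermediate_dec=True)
src = torch.randn(2, 256, 25, 34)            # projected CNN features, N x C x H x W
mask = torch.zeros(2, 25, 34, dtype=torch.bool)
query_embed = torch.randn(100, 256)          # in DETR this is nn.Embedding(100, 256).weight
pos_embed = torch.randn(2, 256, 25, 34)      # output of PositionEmbeddingSine

hs, memory = t(src, mask, query_embed, pos_embed)
print(hs.shape)      # [6, 2, 100, 256]: one slice per decoder layer (return_intermediate_dec)
print(memory.shape)  # [2, 256, 25, 34]: encoder output reshaped back into a feature map
```

Internally `src`, `pos_embed`, and `mask` are flattened so that each of the 25 * 34 = 850 pixels becomes one token of a length-850 sequence.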

The second round of encoding works as follows:

```python
from typing import Optional

from torch import Tensor

# _get_activation_fn and _get_clones are small helpers defined in models/transformer.py.


class TransformerEncoderLayer(nn.Module):

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
                 activation="relu", normalize_before=False):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        return tensor if pos is None else tensor + pos

    def forward_post(self,
                     src,
                     src_mask: Optional[Tensor] = None,
                     src_key_padding_mask: Optional[Tensor] = None,
                     pos: Optional[Tensor] = None):
        # post-norm: attention -> residual -> LayerNorm, then FFN -> residual -> LayerNorm
        q = k = self.with_pos_embed(src, pos)
        src2 = self.self_attn(q, k, value=src, attn_mask=src_mask,
                              key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src

    def forward_pre(self, src,
                    src_mask: Optional[Tensor] = None,
                    src_key_padding_mask: Optional[Tensor] = None,
                    pos: Optional[Tensor] = None):
        # pre-norm variant: LayerNorm is applied before each sub-block instead
        src2 = self.norm1(src)
        q = k = self.with_pos_embed(src2, pos)
        src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask,
                              key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(src2)
        src2 = self.norm2(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src2))))
        src = src + self.dropout2(src2)
        return src

    def forward(self, src,
                src_mask: Optional[Tensor] = None,
                src_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None):
        if self.normalize_before:
            return self.forward_pre(src, src_mask, src_key_padding_mask, pos)
        return self.forward_post(src, src_mask, src_key_padding_mask, pos)
```

```python
class TransformerEncoder(nn.Module):

    def __init__(self, encoder_layer, num_layers, norm=None):
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers)  # num_layers deep copies
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, src,
                mask: Optional[Tensor] = None,
                src_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None):
        output = src

        for layer in self.layers:
            output = layer(output, src_mask=mask,
                           src_key_padding_mask=src_key_padding_mask, pos=pos)

        if self.norm is not None:
            output = self.norm(output)

        return output
```

The key encoding detail: the positional encoding is added to the queries and keys only, while the values stay position-free, and this happens at every layer rather than once at the input.

```python
q = k = self.with_pos_embed(src2, pos)
src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask,
                      key_padding_mask=src_key_padding_mask)[0]
```
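
The same pattern can be reproduced with a bare `nn.MultiheadAttention` (shapes illustrative): position enters the attention weights, but not the content being aggregated.

```python
import torch
from torch import nn

attn = nn.MultiheadAttention(embed_dim=256, num_heads=8)
src = torch.randn(850, 2, 256)  # HW x N x C, as produced by the flattening above
pos = torch.randn(850, 2, 256)  # positional encoding, same shape as src

q = k = src + pos               # position enters the similarity computation...
out, _ = attn(q, k, value=src)  # ...but not the values that get mixed
print(out.shape)                # torch.Size([850, 2, 256])
```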

The decoding proceeds as follows:

```python
class TransformerDecoderLayer(nn.Module):

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
                 activation="relu", normalize_before=False):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        return tensor if pos is None else tensor + pos

    def forward_post(self, tgt, memory,
                     tgt_mask: Optional[Tensor] = None,
                     memory_mask: Optional[Tensor] = None,
                     tgt_key_padding_mask: Optional[Tensor] = None,
                     memory_key_padding_mask: Optional[Tensor] = None,
                     pos: Optional[Tensor] = None,
                     query_pos: Optional[Tensor] = None):
        # 1) self-attention among the object queries
        q = k = self.with_pos_embed(tgt, query_pos)
        tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask,
                              key_padding_mask=tgt_key_padding_mask)[0]
        tgt = tgt + self.dropout1(tgt2)
        tgt = self.norm1(tgt)
        # 2) cross-attention from the queries into the encoder memory
        tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos),
                                   key=self.with_pos_embed(memory, pos),
                                   value=memory, attn_mask=memory_mask,
                                   key_padding_mask=memory_key_padding_mask)[0]
        tgt = tgt + self.dropout2(tgt2)
        tgt = self.norm2(tgt)
        # 3) feed-forward block
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = tgt + self.dropout3(tgt2)
        tgt = self.norm3(tgt)
        return tgt

    def forward_pre(self, tgt, memory,
                    tgt_mask: Optional[Tensor] = None,
                    memory_mask: Optional[Tensor] = None,
                    tgt_key_padding_mask: Optional[Tensor] = None,
                    memory_key_padding_mask: Optional[Tensor] = None,
                    pos: Optional[Tensor] = None,
                    query_pos: Optional[Tensor] = None):
        tgt2 = self.norm1(tgt)
        q = k = self.with_pos_embed(tgt2, query_pos)
        tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask,
                              key_padding_mask=tgt_key_padding_mask)[0]
        tgt = tgt + self.dropout1(tgt2)
        tgt2 = self.norm2(tgt)
        tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos),
                                   key=self.with_pos_embed(memory, pos),
                                   value=memory, attn_mask=memory_mask,
                                   key_padding_mask=memory_key_padding_mask)[0]
        tgt = tgt + self.dropout2(tgt2)
        tgt2 = self.norm3(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
        tgt = tgt + self.dropout3(tgt2)
        return tgt

    def forward(self, tgt, memory,
                tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        if self.normalize_before:
            return self.forward_pre(tgt, memory, tgt_mask, memory_mask,
                                    tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos)
        return self.forward_post(tgt, memory, tgt_mask, memory_mask,
                                 tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos)
```
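
The cross-attention step is where the object queries read from the image: queries are built from `tgt + query_pos`, keys from `memory + pos`, and the values are the raw encoder memory. A stripped-down illustration with made-up sizes:

```python
import torch
from torch import nn

cross = nn.MultiheadAttention(embed_dim=256, num_heads=8)
tgt = torch.randn(100, 2, 256)        # current content of the object queries
query_pos = torch.randn(100, 2, 256)  # learned query embeddings
memory = torch.randn(850, 2, 256)     # encoder output
pos = torch.randn(850, 2, 256)        # image position encoding

out, _ = cross(query=tgt + query_pos,  # who is asking
               key=memory + pos,       # where in the image to look
               value=memory)           # what is actually retrieved
print(out.shape)  # torch.Size([100, 2, 256])
```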

```python
class TransformerDecoder(nn.Module):

    def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False):
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
        self.return_intermediate = return_intermediate

    def forward(self, tgt, memory,
                tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        output = tgt

        intermediate = []

        for layer in self.layers:
            output = layer(output, memory, tgt_mask=tgt_mask,
                           memory_mask=memory_mask,
                           tgt_key_padding_mask=tgt_key_padding_mask,
                           memory_key_padding_mask=memory_key_padding_mask,
                           pos=pos, query_pos=query_pos)
            if self.return_intermediate:
                intermediate.append(self.norm(output))

        if self.norm is not None:
            output = self.norm(output)
            if self.return_intermediate:
                intermediate.pop()
                intermediate.append(output)

        if self.return_intermediate:
            return torch.stack(intermediate)

        return output.unsqueeze(0)
```

4. The forward pass

```python
class DETR(nn.Module):
    """ This is the DETR module that performs object detection """
    def __init__(self, backbone, transformer, num_classes, num_queries, aux_loss=False):
        """ Initializes the model.
        Parameters:
            backbone: torch module of the backbone to be used. See backbone.py
            transformer: torch module of the transformer architecture. See transformer.py
            num_classes: number of object classes
            num_queries: number of object queries, i.e. detection slots. This is the maximal number of objects
                         DETR can detect in a single image. For COCO, we recommend 100 queries.
            aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.
        """
        super().__init__()
        self.num_queries = num_queries
        self.transformer = transformer
        hidden_dim = transformer.d_model
        self.class_embed = nn.Linear(hidden_dim, num_classes + 1)
        self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
        self.query_embed = nn.Embedding(num_queries, hidden_dim)
        self.input_proj = nn.Conv2d(backbone.num_channels, hidden_dim, kernel_size=1)
        self.backbone = backbone
        self.aux_loss = aux_loss

    def forward(self, samples: NestedTensor):
        """ The forward expects a NestedTensor, which consists of:
               - samples.tensor: batched images, of shape [batch_size x 3 x H x W]
               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

            It returns a dict with the following elements:
               - "pred_logits": the classification logits (including no-object) for all queries.
                                Shape = [batch_size x num_queries x (num_classes + 1)]
               - "pred_boxes": the normalized box coordinates for all queries, represented as
                               (center_x, center_y, width, height). These values are normalized in [0, 1],
                               relative to the size of each individual image (disregarding possible padding).
                               See PostProcess for information on how to retrieve the unnormalized bounding box.
               - "aux_outputs": optional, only returned when auxiliary losses are activated. It is a list of
                                dictionaries containing the two above keys for each decoder layer.
        """
        if isinstance(samples, (list, torch.Tensor)):
            samples = nested_tensor_from_tensor_list(samples)
        features, pos = self.backbone(samples)

        src, mask = features[-1].decompose()
        assert mask is not None
        hs = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos[-1])[0]

        outputs_class = self.class_embed(hs)
        outputs_coord = self.bbox_embed(hs).sigmoid()
        out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]}
        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord)
        return out

    @torch.jit.unused
    def _set_aux_loss(self, outputs_class, outputs_coord):
        # this is a workaround to make torchscript happy, as torchscript
        # doesn't support dictionaries with non-homogeneous values, such
        # as a dict having both a Tensor and a list.
        return [{'pred_logits': a, 'pred_boxes': b}
                for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]
```
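
Putting the pieces together, a hedged end-to-end smoke test (91 COCO class ids plus the implicit no-object class, 100 queries; all sizes illustrative). Note that `pretrained=is_main_process()` in `Backbone` downloads ImageNet weights on first use.

```python
import torch

backbone = Joiner(Backbone('resnet50', True, False, False),
                  PositionEmbeddingSine(128, normalize=True))
backbone.num_channels = 2048

transformer = Transformer(d_model=256, return_intermediate_dec=True)
model = DETR(backbone, transformer, num_classes=91, num_queries=100, aux_loss=True)

imgs = [torch.randn(3, 480, 640), torch.randn(3, 512, 512)]  # unequal sizes are fine
out = model(imgs)  # padded and masked internally via nested_tensor_from_tensor_list
print(out['pred_logits'].shape)  # torch.Size([2, 100, 92])
print(out['pred_boxes'].shape)   # torch.Size([2, 100, 4])
```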

5. Computing the loss

First, the decoder output hs is projected into per-query class logits and box coordinates:

```python
import torch.nn.functional as F

# from DETR.__init__: one linear head for classes (+1 for "no object"), one MLP for boxes
self.class_embed = nn.Linear(hidden_dim, num_classes + 1)
self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)


class MLP(nn.Module):
    """ Very simple multi-layer perceptron (also called FFN)"""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        return x
```
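
A quick shape check for `bbox_embed` (sizes illustrative): the linear layers broadcast over all leading dimensions, so the 3-layer MLP maps every query of every decoder layer straight to 4 box parameters, which `sigmoid` then squashes into [0, 1].

```python
import torch

bbox_embed = MLP(input_dim=256, hidden_dim=256, output_dim=4, num_layers=3)
hs = torch.randn(6, 2, 100, 256)  # decoder layers x batch x queries x hidden_dim
boxes = bbox_embed(hs).sigmoid()  # normalized (center_x, center_y, width, height)
print(boxes.shape)                # torch.Size([6, 2, 100, 4])
```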

Predictions are then matched one-to-one against the ground-truth boxes by a Hungarian matcher; unmatched queries fall to the no-object class:

```python
from scipy.optimize import linear_sum_assignment

from util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou


class HungarianMatcher(nn.Module):
    """This class computes an assignment between the targets and the predictions of the network

    For efficiency reasons, the targets don't include the no_object. Because of this, in general,
    there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
    while the others are un-matched (and thus treated as non-objects).
    """

    def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1):
        """Creates the matcher

        Params:
            cost_class: This is the relative weight of the classification error in the matching cost
            cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
            cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost
        """
        super().__init__()
        self.cost_class = cost_class
        self.cost_bbox = cost_bbox
        self.cost_giou = cost_giou
        assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs can't be 0"

    @torch.no_grad()
    def forward(self, outputs, targets):
        """ Performs the matching

        Params:
            outputs: This is a dict that contains at least these entries:
                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates

            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                           objects in the target) containing the class labels
                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        bs, num_queries = outputs["pred_logits"].shape[:2]

        # We flatten to compute the cost matrices in a batch
        out_prob = outputs["pred_logits"].flatten(0, 1).softmax(-1)  # [batch_size * num_queries, num_classes]
        out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]

        # Also concat the target labels and boxes
        tgt_ids = torch.cat([v["labels"] for v in targets])
        tgt_bbox = torch.cat([v["boxes"] for v in targets])

        # Compute the classification cost. Contrary to the loss, we don't use the NLL,
        # but approximate it as 1 - proba[target class].
        # The 1 is a constant that doesn't change the matching, so it can be omitted.
        cost_class = -out_prob[:, tgt_ids]

        # Compute the L1 cost between boxes
        cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)

        # Compute the giou cost between boxes
        cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))

        # Final cost matrix
        C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
        C = C.view(bs, num_queries, -1).cpu()

        # solve the assignment problem independently per image
        sizes = [len(v["boxes"]) for v in targets]
        indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
```
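
A tiny worked example of the matcher, with random numbers and DETR's default cost weights (`cost_class=1, cost_bbox=5, cost_giou=2`); `linear_sum_assignment` is scipy's Hungarian-algorithm solver.

```python
import torch

matcher = HungarianMatcher(cost_class=1, cost_bbox=5, cost_giou=2)
outputs = {'pred_logits': torch.randn(1, 100, 92),  # 1 image, 100 queries
           'pred_boxes': torch.rand(1, 100, 4)}     # (cx, cy, w, h), already in [0, 1]
targets = [{'labels': torch.tensor([3, 17]),
            'boxes': torch.tensor([[0.5, 0.5, 0.2, 0.3],
                                   [0.2, 0.8, 0.1, 0.1]])}]

indices = matcher(outputs, targets)
print(indices)  # e.g. [(tensor([q1, q2]), tensor([0, 1]))]: each query paired with one target
```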