há 4 anos atrás · ba2978839d
--- a/README.md
+++ b/README.md
@@ -0,0 +1,3 @@
 
				+# 梅阳阳的学习空间
			
 
				+
			
 
				+### 分享源码阅读相关笔记
			
--- a/笔记/BERT.md
+++ b/笔记/BERT.md
@@ -0,0 +1 @@
 
				+# 待定
			
--- a/笔记/CNN-Transformer.md
+++ b/笔记/CNN-Transformer.md
@@ -0,0 +1,494 @@
 
				+1. 位置向量
			
 
				+```python
			
 
				+class PositionEmbeddingSine(nn.Module):
			
 
				+    """
			
 
				+    This is a more standard version of the position embedding, very similar to the one
			
 
				+    used by the Attention is all you need paper, generalized to work on images.
			
 
				+    """
			
 
				+    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
			
 
				+        super().__init__()
			
 
				+        self.num_pos_feats = num_pos_feats
			
 
				+        self.temperature = temperature
			
 
				+        self.normalize = normalize
			
 
				+        if scale is not None and normalize is False:
			
 
				+            raise ValueError("normalize should be True if scale is passed")
			
 
				+        if scale is None:
			
 
				+            scale = 2 * math.pi
			
 
				+        self.scale = scale
			
 
				+
			
 
				+    def forward(self, tensor_list: NestedTensor):
			
 
				+        x = tensor_list.tensors
			
 
				+        mask = tensor_list.mask
			
 
				+        assert mask is not None
			
 
				+        not_mask = ~mask
			
 
				+        y_embed = not_mask.cumsum(1, dtype=torch.float32)
			
 
				+        x_embed = not_mask.cumsum(2, dtype=torch.float32)
			
 
				+        if self.normalize:
			
 
				+            eps = 1e-6
			
 
				+            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
			
 
				+            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
			
 
				+
			
 
				+        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
			
 
				+        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
			
 
				+
			
 
				+        pos_x = x_embed[:, :, :, None] / dim_t
			
 
				+        pos_y = y_embed[:, :, :, None] / dim_t
			
 
				+        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
			
 
				+        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
			
 
				+        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
			
 
				+        return pos
			
 
				+```
			
 
				+2. 构造 ResNet 提取 CNN 特征
			
 
				+```python
			
 
				+class Backbone(BackboneBase):
			
 
				+    """ResNet backbone with frozen BatchNorm."""
			
 
				+    def __init__(self, name: str,
			
 
				+                 train_backbone: bool,
			
 
				+                 return_interm_layers: bool,
			
 
				+                 dilation: bool):
			
 
				+        backbone = getattr(torchvision.models, name)(
			
 
				+            replace_stride_with_dilation=[False, False, dilation],
			
 
				+            pretrained=is_main_process(), norm_layer=FrozenBatchNorm2d)
			
 
				+        num_channels = 512 if name in ('resnet18', 'resnet34') else 2048
			
 
				+        super().__init__(backbone, train_backbone, num_channels, return_interm_layers)
			
 
				+```
			
 
				+
			
 
				+3. Transformer 先使用 multi-head attention 对 CNN 特征进行第二次编码，然后继续使用 multi-head attention 进行解码
			
 
				+
			
 
				+整体流程如下
			
 
				+```python
			
 
				+class Transformer(nn.Module):
			
 
				+
			
 
				+    def __init__(self, d_model=512, nhead=8, num_encoder_layers=6,
			
 
				+                 num_decoder_layers=6, dim_feedforward=2048, dropout=0.1,
			
 
				+                 activation="relu", normalize_before=False,
			
 
				+                 return_intermediate_dec=False):
			
 
				+        super().__init__()
			
 
				+
			
 
				+        encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward,
			
 
				+                                                dropout, activation, normalize_before)
			
 
				+        encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
			
 
				+        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
			
 
				+
			
 
				+        decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward,
			
 
				+                                                dropout, activation, normalize_before)
			
 
				+        decoder_norm = nn.LayerNorm(d_model)
			
 
				+        self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm,
			
 
				+                                          return_intermediate=return_intermediate_dec)
			
 
				+
			
 
				+        self._reset_parameters()
			
 
				+
			
 
				+        self.d_model = d_model
			
 
				+        self.nhead = nhead
			
 
				+
			
 
				+    def _reset_parameters(self):
			
 
				+        for p in self.parameters():
			
 
				+            if p.dim() > 1:
			
 
				+                nn.init.xavier_uniform_(p)
			
 
				+
			
 
				+    def forward(self, src, mask, query_embed, pos_embed):
			
 
				+        # flatten NxCxHxW to HWxNxC
			
 
				+        bs, c, h, w = src.shape
			
 
				+        src = src.flatten(2).permute(2, 0, 1)
			
 
				+        pos_embed = pos_embed.flatten(2).permute(2, 0, 1)
			
 
				+        query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1)
			
 
				+        mask = mask.flatten(1)
			
 
				+
			
 
				+        tgt = torch.zeros_like(query_embed)
			
 
				+        memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed)
			
 
				+        hs = self.decoder(tgt, memory, memory_key_padding_mask=mask,
			
 
				+                          pos=pos_embed, query_pos=query_embed)
			
 
				+        return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w)
			
 
				+
			
 
				+```
			
 
				+
			
 
				+
			
 
				+第二次编码过程如下
			
 
				+```python
			
 
				+class TransformerEncoderLayer(nn.Module):
			
 
				+
			
 
				+    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
			
 
				+                 activation="relu", normalize_before=False):
			
 
				+        super().__init__()
			
 
				+        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
			
 
				+        # Implementation of Feedforward model
			
 
				+        self.linear1 = nn.Linear(d_model, dim_feedforward)
			
 
				+        self.dropout = nn.Dropout(dropout)
			
 
				+        self.linear2 = nn.Linear(dim_feedforward, d_model)
			
 
				+
			
 
				+        self.norm1 = nn.LayerNorm(d_model)
			
 
				+        self.norm2 = nn.LayerNorm(d_model)
			
 
				+        self.dropout1 = nn.Dropout(dropout)
			
 
				+        self.dropout2 = nn.Dropout(dropout)
			
 
				+
			
 
				+        self.activation = _get_activation_fn(activation)
			
 
				+        self.normalize_before = normalize_before
			
 
				+
			
 
				+    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
			
 
				+        return tensor if pos is None else tensor + pos
			
 
				+
			
 
				+    def forward_post(self,
			
 
				+                     src,
			
 
				+                     src_mask: Optional[Tensor] = None,
			
 
				+                     src_key_padding_mask: Optional[Tensor] = None,
			
 
				+                     pos: Optional[Tensor] = None):
			
 
				+        q = k = self.with_pos_embed(src, pos)
			
 
				+        src2 = self.self_attn(q, k, value=src, attn_mask=src_mask,
			
 
				+                              key_padding_mask=src_key_padding_mask)[0]
			
 
				+        src = src + self.dropout1(src2)
			
 
				+        src = self.norm1(src)
			
 
				+        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
			
 
				+        src = src + self.dropout2(src2)
			
 
				+        src = self.norm2(src)
			
 
				+        return src
			
 
				+
			
 
				+    def forward_pre(self, src,
			
 
				+                    src_mask: Optional[Tensor] = None,
			
 
				+                    src_key_padding_mask: Optional[Tensor] = None,
			
 
				+                    pos: Optional[Tensor] = None):
			
 
				+        src2 = self.norm1(src)
			
 
				+        q = k = self.with_pos_embed(src2, pos)
			
 
				+        src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask,
			
 
				+                              key_padding_mask=src_key_padding_mask)[0]
			
 
				+        src = src + self.dropout1(src2)
			
 
				+        src2 = self.norm2(src)
			
 
				+        src2 = self.linear2(self.dropout(self.activation(self.linear1(src2))))
			
 
				+        src = src + self.dropout2(src2)
			
 
				+        return src
			
 
				+
			
 
				+    def forward(self, src,
			
 
				+                src_mask: Optional[Tensor] = None,
			
 
				+                src_key_padding_mask: Optional[Tensor] = None,
			
 
				+                pos: Optional[Tensor] = None):
			
 
				+        if self.normalize_before:
			
 
				+            return self.forward_pre(src, src_mask, src_key_padding_mask, pos)
			
 
				+        return self.forward_post(src, src_mask, src_key_padding_mask, pos)
			
 
				+```
			
 
				+
			
 
				+```python
			
 
				+class TransformerEncoder(nn.Module):
			
 
				+
			
 
				+    def __init__(self, encoder_layer, num_layers, norm=None):
			
 
				+        super().__init__()
			
 
				+        self.layers = _get_clones(encoder_layer, num_layers)
			
 
				+        self.num_layers = num_layers
			
 
				+        self.norm = norm
			
 
				+
			
 
				+    def forward(self, src,
			
 
				+                mask: Optional[Tensor] = None,
			
 
				+                src_key_padding_mask: Optional[Tensor] = None,
			
 
				+                pos: Optional[Tensor] = None):
			
 
				+        output = src
			
 
				+
			
 
				+        for layer in self.layers:
			
 
				+            output = layer(output, src_mask=mask,
			
 
				+                           src_key_padding_mask=src_key_padding_mask, pos=pos)
			
 
				+
			
 
				+        if self.norm is not None:
			
 
				+            output = self.norm(output)
			
 
				+
			
 
				+        return output
			
 
				+```
			
 
				+
			
 
				+
			
 
				+编码细节
			
 
				+```python
			
 
				+q = k = self.with_pos_embed(src2, pos)
			
 
				+src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask,
			
 
				+                      key_padding_mask=src_key_padding_mask)[0]
			
 
				+```
			
 
				+
			
 
				+
			
 
				+解码过程如下
			
 
				+```python
			
 
				+class TransformerDecoderLayer(nn.Module):
			
 
				+
			
 
				+    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
			
 
				+                 activation="relu", normalize_before=False):
			
 
				+        super().__init__()
			
 
				+        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
			
 
				+        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
			
 
				+        # Implementation of Feedforward model
			
 
				+        self.linear1 = nn.Linear(d_model, dim_feedforward)
			
 
				+        self.dropout = nn.Dropout(dropout)
			
 
				+        self.linear2 = nn.Linear(dim_feedforward, d_model)
			
 
				+
			
 
				+        self.norm1 = nn.LayerNorm(d_model)
			
 
				+        self.norm2 = nn.LayerNorm(d_model)
			
 
				+        self.norm3 = nn.LayerNorm(d_model)
			
 
				+        self.dropout1 = nn.Dropout(dropout)
			
 
				+        self.dropout2 = nn.Dropout(dropout)
			
 
				+        self.dropout3 = nn.Dropout(dropout)
			
 
				+
			
 
				+        self.activation = _get_activation_fn(activation)
			
 
				+        self.normalize_before = normalize_before
			
 
				+
			
 
				+    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
			
 
				+        return tensor if pos is None else tensor + pos
			
 
				+
			
 
				+    def forward_post(self, tgt, memory,
			
 
				+                     tgt_mask: Optional[Tensor] = None,
			
 
				+                     memory_mask: Optional[Tensor] = None,
			
 
				+                     tgt_key_padding_mask: Optional[Tensor] = None,
			
 
				+                     memory_key_padding_mask: Optional[Tensor] = None,
			
 
				+                     pos: Optional[Tensor] = None,
			
 
				+                     query_pos: Optional[Tensor] = None):
			
 
				+        q = k = self.with_pos_embed(tgt, query_pos)
			
 
				+        tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask,
			
 
				+                              key_padding_mask=tgt_key_padding_mask)[0]
			
 
				+        tgt = tgt + self.dropout1(tgt2)
			
 
				+        tgt = self.norm1(tgt)
			
 
				+        tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos),
			
 
				+                                   key=self.with_pos_embed(memory, pos),
			
 
				+                                   value=memory, attn_mask=memory_mask,
			
 
				+                                   key_padding_mask=memory_key_padding_mask)[0]
			
 
				+        tgt = tgt + self.dropout2(tgt2)
			
 
				+        tgt = self.norm2(tgt)
			
 
				+        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
			
 
				+        tgt = tgt + self.dropout3(tgt2)
			
 
				+        tgt = self.norm3(tgt)
			
 
				+        return tgt
			
 
				+
			
 
				+    def forward_pre(self, tgt, memory,
			
 
				+                    tgt_mask: Optional[Tensor] = None,
			
 
				+                    memory_mask: Optional[Tensor] = None,
			
 
				+                    tgt_key_padding_mask: Optional[Tensor] = None,
			
 
				+                    memory_key_padding_mask: Optional[Tensor] = None,
			
 
				+                    pos: Optional[Tensor] = None,
			
 
				+                    query_pos: Optional[Tensor] = None):
			
 
				+        tgt2 = self.norm1(tgt)
			
 
				+        q = k = self.with_pos_embed(tgt2, query_pos)
			
 
				+        tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask,
			
 
				+                              key_padding_mask=tgt_key_padding_mask)[0]
			
 
				+        tgt = tgt + self.dropout1(tgt2)
			
 
				+        tgt2 = self.norm2(tgt)
			
 
				+        tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos),
			
 
				+                                   key=self.with_pos_embed(memory, pos),
			
 
				+                                   value=memory, attn_mask=memory_mask,
			
 
				+                                   key_padding_mask=memory_key_padding_mask)[0]
			
 
				+        tgt = tgt + self.dropout2(tgt2)
			
 
				+        tgt2 = self.norm3(tgt)
			
 
				+        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
			
 
				+        tgt = tgt + self.dropout3(tgt2)
			
 
				+        return tgt
			
 
				+
			
 
				+    def forward(self, tgt, memory,
			
 
				+                tgt_mask: Optional[Tensor] = None,
			
 
				+                memory_mask: Optional[Tensor] = None,
			
 
				+                tgt_key_padding_mask: Optional[Tensor] = None,
			
 
				+                memory_key_padding_mask: Optional[Tensor] = None,
			
 
				+                pos: Optional[Tensor] = None,
			
 
				+                query_pos: Optional[Tensor] = None):
			
 
				+        if self.normalize_before:
			
 
				+            return self.forward_pre(tgt, memory, tgt_mask, memory_mask,
			
 
				+                                    tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos)
			
 
				+        return self.forward_post(tgt, memory, tgt_mask, memory_mask,
			
 
				+                                 tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos)
			
 
				+
			
 
				+```
			
 
				+
			
 
				+```python
			
 
				+class TransformerDecoder(nn.Module):
			
 
				+
			
 
				+    def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False):
			
 
				+        super().__init__()
			
 
				+        self.layers = _get_clones(decoder_layer, num_layers)
			
 
				+        self.num_layers = num_layers
			
 
				+        self.norm = norm
			
 
				+        self.return_intermediate = return_intermediate
			
 
				+
			
 
				+    def forward(self, tgt, memory,
			
 
				+                tgt_mask: Optional[Tensor] = None,
			
 
				+                memory_mask: Optional[Tensor] = None,
			
 
				+                tgt_key_padding_mask: Optional[Tensor] = None,
			
 
				+                memory_key_padding_mask: Optional[Tensor] = None,
			
 
				+                pos: Optional[Tensor] = None,
			
 
				+                query_pos: Optional[Tensor] = None):
			
 
				+        output = tgt
			
 
				+
			
 
				+        intermediate = []
			
 
				+
			
 
				+        for layer in self.layers:
			
 
				+            output = layer(output, memory, tgt_mask=tgt_mask,
			
 
				+                           memory_mask=memory_mask,
			
 
				+                           tgt_key_padding_mask=tgt_key_padding_mask,
			
 
				+                           memory_key_padding_mask=memory_key_padding_mask,
			
 
				+                           pos=pos, query_pos=query_pos)
			
 
				+            if self.return_intermediate:
			
 
				+                intermediate.append(self.norm(output))
			
 
				+
			
 
				+        if self.norm is not None:
			
 
				+            output = self.norm(output)
			
 
				+            if self.return_intermediate:
			
 
				+                intermediate.pop()
			
 
				+                intermediate.append(output)
			
 
				+
			
 
				+        if self.return_intermediate:
			
 
				+            return torch.stack(intermediate)
			
 
				+
			
 
				+        return output.unsqueeze(0)
			
 
				+```
			
 
				+
			
 
				+
			
 
				+
			
 
				+4. 前向计算
			
 
				+```python
			
 
				+class DETR(nn.Module):
			
 
				+    """ This is the DETR module that performs object detection """
			
 
				+    def __init__(self, backbone, transformer, num_classes, num_queries, aux_loss=False):
			
 
				+        """ Initializes the model.
			
 
				+        Parameters:
			
 
				+            backbone: torch module of the backbone to be used. See backbone.py
			
 
				+            transformer: torch module of the transformer architecture. See transformer.py
			
 
				+            num_classes: number of object classes
			
 
				+            num_queries: number of object queries, ie detection slot. This is the maximal number of objects
			
 
				+                         DETR can detect in a single image. For COCO, we recommend 100 queries.
			
 
				+            aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.
			
 
				+        """
			
 
				+        super().__init__()
			
 
				+        self.num_queries = num_queries
			
 
				+        self.transformer = transformer
			
 
				+        hidden_dim = transformer.d_model
			
 
				+        self.class_embed = nn.Linear(hidden_dim, num_classes + 1)
			
 
				+        self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
			
 
				+        self.query_embed = nn.Embedding(num_queries, hidden_dim)
			
 
				+        self.input_proj = nn.Conv2d(backbone.num_channels, hidden_dim, kernel_size=1)
			
 
				+        self.backbone = backbone
			
 
				+        self.aux_loss = aux_loss
			
 
				+
			
 
				+    def forward(self, samples: NestedTensor):
			
 
				+        """ The forward expects a NestedTensor, which consists of:
			
 
				+               - samples.tensor: batched images, of shape [batch_size x 3 x H x W]
			
 
				+               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels
			
 
				+
			
 
				+            It returns a dict with the following elements:
			
 
				+               - "pred_logits": the classification logits (including no-object) for all queries.
			
 
				+                                Shape= [batch_size x num_queries x (num_classes + 1)]
			
 
				+               - "pred_boxes": The normalized boxes coordinates for all queries, represented as
			
 
				+                               (center_x, center_y, height, width). These values are normalized in [0, 1],
			
 
				+                               relative to the size of each individual image (disregarding possible padding).
			
 
				+                               See PostProcess for information on how to retrieve the unnormalized bounding box.
			
 
				+               - "aux_outputs": Optional, only returned when auxilary losses are activated. It is a list of
			
 
				+                                dictionnaries containing the two above keys for each decoder layer.
			
 
				+        """
			
 
				+        if isinstance(samples, (list, torch.Tensor)):
			
 
				+            samples = nested_tensor_from_tensor_list(samples)
			
 
				+        features, pos = self.backbone(samples)
			
 
				+
			
 
				+        src, mask = features[-1].decompose()
			
 
				+        assert mask is not None
			
 
				+        hs = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos[-1])[0]
			
 
				+
			
 
				+        outputs_class = self.class_embed(hs)
			
 
				+        outputs_coord = self.bbox_embed(hs).sigmoid()
			
 
				+        out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]}
			
 
				+        if self.aux_loss:
			
 
				+            out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord)
			
 
				+        return out
			
 
				+
			
 
				+    @torch.jit.unused
			
 
				+    def _set_aux_loss(self, outputs_class, outputs_coord):
			
 
				+        # this is a workaround to make torchscript happy, as torchscript
			
 
				+        # doesn't support dictionary with non-homogeneous values, such
			
 
				+        # as a dict having both a Tensor and a list.
			
 
				+        return [{'pred_logits': a, 'pred_boxes': b}
			
 
				+                for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]
			
 
				+```
			
 
				+
			
 
				+5. 计算损失
			
 
				+首先把 hs 转化为目标的类别和边界框
			
 
				+```python
			
 
				+self.class_embed = nn.Linear(hidden_dim, num_classes + 1)
			
 
				+self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
			
 
				+
			
 
				+class MLP(nn.Module):
			
 
				+    """ Very simple multi-layer perceptron (also called FFN)"""
			
 
				+
			
 
				+    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
			
 
				+        super().__init__()
			
 
				+        self.num_layers = num_layers
			
 
				+        h = [hidden_dim] * (num_layers - 1)
			
 
				+        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
			
 
				+
			
 
				+    def forward(self, x):
			
 
				+        for i, layer in enumerate(self.layers):
			
 
				+            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
			
 
				+        return x
			
 
				+```
			
 
				+  
			
 
				+
			
 
				+```python
			
 
				+class HungarianMatcher(nn.Module):
			
 
				+    """This class computes an assignment between the targets and the predictions of the network
			
 
				+
			
 
				+    For efficiency reasons, the targets don't include the no_object. Because of this, in general,
			
 
				+    there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
			
 
				+    while the others are un-matched (and thus treated as non-objects).
			
 
				+    """
			
 
				+
			
 
				+    def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1):
			
 
				+        """Creates the matcher
			
 
				+
			
 
				+        Params:
			
 
				+            cost_class: This is the relative weight of the classification error in the matching cost
			
 
				+            cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
			
 
				+            cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost
			
 
				+        """
			
 
				+        super().__init__()
			
 
				+        self.cost_class = cost_class
			
 
				+        self.cost_bbox = cost_bbox
			
 
				+        self.cost_giou = cost_giou
			
 
				+        assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0"
			
 
				+
			
 
				+    @torch.no_grad()
			
 
				+    def forward(self, outputs, targets):
			
 
				+        """ Performs the matching
			
 
				+
			
 
				+        Params:
			
 
				+            outputs: This is a dict that contains at least these entries:
			
 
				+                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
			
 
				+                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates
			
 
				+
			
 
				+            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
			
 
				+                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
			
 
				+                           objects in the target) containing the class labels
			
 
				+                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates
			
 
				+
			
 
				+        Returns:
			
 
				+            A list of size batch_size, containing tuples of (index_i, index_j) where:
			
 
				+                - index_i is the indices of the selected predictions (in order)
			
 
				+                - index_j is the indices of the corresponding selected targets (in order)
			
 
				+            For each batch element, it holds:
			
 
				+                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
			
 
				+        """
			
 
				+        bs, num_queries = outputs["pred_logits"].shape[:2]
			
 
				+
			
 
				+        # We flatten to compute the cost matrices in a batch
			
 
				+        out_prob = outputs["pred_logits"].flatten(0, 1).softmax(-1)  # [batch_size * num_queries, num_classes]
			
 
				+        out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]
			
 
				+
			
 
				+        # Also concat the target labels and boxes
			
 
				+        tgt_ids = torch.cat([v["labels"] for v in targets])
			
 
				+        tgt_bbox = torch.cat([v["boxes"] for v in targets])
			
 
				+
			
 
				+        # Compute the classification cost. Contrary to the loss, we don't use the NLL,
			
 
				+        # but approximate it in 1 - proba[target class].
			
 
				+        # The 1 is a constant that doesn't change the matching, it can be ommitted.
			
 
				+        cost_class = -out_prob[:, tgt_ids]
			
 
				+
			
 
				+        # Compute the L1 cost between boxes
			
 
				+        cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)
			
 
				+
			
 
				+        # Compute the giou cost betwen boxes
			
 
				+        cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))
			
 
				+
			
 
				+        # Final cost matrix
			
 
				+        C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
			
 
				+        C = C.view(bs, num_queries, -1).cpu()
			
 
				+
			
 
				+        sizes = [len(v["boxes"]) for v in targets]
			
 
				+        indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
			
 
				+        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
			
 
				+```
			
 
				+
			
 
				+