1. Position embedding

```python
import math

import torch
from torch import nn

from util.misc import NestedTensor  # DETR's container pairing a padded tensor with its mask


class PositionEmbeddingSine(nn.Module):
    """
    This is a more standard version of the position embedding, very similar to the one
    used by the Attention is all you need paper, generalized to work on images.
    """
    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
        super().__init__()
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi
        self.scale = scale

    def forward(self, tensor_list: NestedTensor):
        x = tensor_list.tensors
        mask = tensor_list.mask
        assert mask is not None
        not_mask = ~mask
        # cumulative sums over the unmasked pixels give each pixel its (row, col) index
        y_embed = not_mask.cumsum(1, dtype=torch.float32)
        x_embed = not_mask.cumsum(2, dtype=torch.float32)
        if self.normalize:
            eps = 1e-6
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)

        # interleave sin/cos over the channel dimension, then concatenate the y- and x-encodings
        pos_x = x_embed[:, :, :, None] / dim_t
        pos_y = y_embed[:, :, :, None] / dim_t
        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
        return pos
```
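
For a quick sanity check, here is how the embedding behaves on a dummy input; the `NestedTensor` wrapper comes from DETR's `util.misc`, and the sizes below are made up for illustration.

```python
import torch

# Illustrative sizes: batch of 2, a 25x34 feature map, 128 features per axis.
pe = PositionEmbeddingSine(num_pos_feats=128, normalize=True)
feat = torch.randn(2, 256, 25, 34)               # CNN feature map, N x C x H x W
mask = torch.zeros(2, 25, 34, dtype=torch.bool)  # False = real pixel, True = padding
pos = pe(NestedTensor(feat, mask))
print(pos.shape)  # torch.Size([2, 256, 25, 34]) -- one 256-d position vector per pixel
```

Each pixel gets `num_pos_feats` sine/cosine channels for its row index and another `num_pos_feats` for its column index, so the output has `2 * num_pos_feats` channels.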

2. Building a ResNet backbone to extract CNN features

```python
import torchvision

# BackboneBase and FrozenBatchNorm2d are defined earlier in DETR's models/backbone.py;
# is_main_process comes from util.misc.
class Backbone(BackboneBase):
    """ResNet backbone with frozen BatchNorm."""
    def __init__(self, name: str,
                 train_backbone: bool,
                 return_interm_layers: bool,
                 dilation: bool):
        backbone = getattr(torchvision.models, name)(
            replace_stride_with_dilation=[False, False, dilation],
            pretrained=is_main_process(), norm_layer=FrozenBatchNorm2d)
        # ResNet-18/34 end with 512 channels, the deeper variants with 2048
        num_channels = 512 if name in ('resnet18', 'resnet34') else 2048
        super().__init__(backbone, train_backbone, num_channels, return_interm_layers)
```
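
In the repo these pieces are glued together by `build_backbone` (models/backbone.py), which wraps the CNN and the position embedding in a small `Joiner` module. A condensed sketch with illustrative arguments; `Joiner` and `nested_tensor_from_tensor_list` are not shown in this post and come from models/backbone.py and util/misc.py respectively.

```python
import torch
from util.misc import nested_tensor_from_tensor_list

# Joiner is an nn.Sequential subclass: it runs the CNN, then computes a
# position encoding for every returned feature map.
backbone = Backbone('resnet50', train_backbone=True,
                    return_interm_layers=False, dilation=False)
position_embedding = PositionEmbeddingSine(num_pos_feats=128, normalize=True)
model = Joiner(backbone, position_embedding)
model.num_channels = backbone.num_channels  # 2048 for resnet50

imgs = [torch.randn(3, 480, 640)]           # differently sized images get padded
features, pos = model(nested_tensor_from_tensor_list(imgs))
# features[-1].tensors: [1, 2048, 15, 20] (H and W divided by 32)
# pos[-1]:              [1, 256, 15, 20]  (one position vector per feature-map pixel)
```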

3. The transformer re-encodes the CNN features with multi-head attention, then decodes them, again with multi-head attention

The overall flow:

```python
class Transformer(nn.Module):

    def __init__(self, d_model=512, nhead=8, num_encoder_layers=6,
                 num_decoder_layers=6, dim_feedforward=2048, dropout=0.1,
                 activation="relu", normalize_before=False,
                 return_intermediate_dec=False):
        super().__init__()

        encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward,
                                                dropout, activation, normalize_before)
        encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)

        decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward,
                                                dropout, activation, normalize_before)
        decoder_norm = nn.LayerNorm(d_model)
        self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm,
                                          return_intermediate=return_intermediate_dec)

        self._reset_parameters()

        self.d_model = d_model
        self.nhead = nhead

    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, src, mask, query_embed, pos_embed):
        # flatten NxCxHxW to HWxNxC
        bs, c, h, w = src.shape
        src = src.flatten(2).permute(2, 0, 1)
        pos_embed = pos_embed.flatten(2).permute(2, 0, 1)
        query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1)
        mask = mask.flatten(1)

        # the decoder input starts at zero; all information enters via query_embed and memory
        tgt = torch.zeros_like(query_embed)
        memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed)
        hs = self.decoder(tgt, memory, memory_key_padding_mask=mask,
                          pos=pos_embed, query_pos=query_embed)
        return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w)
```
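
Tracing the shapes through `forward` makes the flattening obvious; the numbers below are illustrative (batch of 2, 25x34 feature map, d_model=256, 100 object queries):

```python
import torch

t = Transformer(d_model=256, nhead=8, return_intermediate_dec=True)
src = torch.randn(2, 256, 25, 34)            # projected CNN features, N x C x H x W
mask = torch.zeros(2, 25, 34, dtype=torch.bool)
query_embed = torch.randn(100, 256)          # in DETR this is nn.Embedding(100, 256).weight
pos_embed = torch.randn(2, 256, 25, 34)      # output of PositionEmbeddingSine

hs, memory = t(src, mask, query_embed, pos_embed)
print(hs.shape)      # [6, 2, 100, 256]: one slice per decoder layer (return_intermediate_dec)
print(memory.shape)  # [2, 256, 25, 34]: encoder output reshaped back into a feature map
```

Internally `src`, `pos_embed`, and `mask` are flattened so that each of the 25 * 34 = 850 pixels becomes one token of a length-850 sequence.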

The second round of encoding works as follows:

```python
from typing import Optional

from torch import Tensor

# _get_activation_fn and _get_clones are small helpers defined in models/transformer.py.


class TransformerEncoderLayer(nn.Module):

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
                 activation="relu", normalize_before=False):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        return tensor if pos is None else tensor + pos

    def forward_post(self,
                     src,
                     src_mask: Optional[Tensor] = None,
                     src_key_padding_mask: Optional[Tensor] = None,
                     pos: Optional[Tensor] = None):
        # post-norm: attention -> residual -> LayerNorm, then FFN -> residual -> LayerNorm
        q = k = self.with_pos_embed(src, pos)
        src2 = self.self_attn(q, k, value=src, attn_mask=src_mask,
                              key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src

    def forward_pre(self, src,
                    src_mask: Optional[Tensor] = None,
                    src_key_padding_mask: Optional[Tensor] = None,
                    pos: Optional[Tensor] = None):
        # pre-norm variant: LayerNorm is applied before each sub-block instead
        src2 = self.norm1(src)
        q = k = self.with_pos_embed(src2, pos)
        src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask,
                              key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(src2)
        src2 = self.norm2(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src2))))
        src = src + self.dropout2(src2)
        return src

    def forward(self, src,
                src_mask: Optional[Tensor] = None,
                src_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None):
        if self.normalize_before:
            return self.forward_pre(src, src_mask, src_key_padding_mask, pos)
        return self.forward_post(src, src_mask, src_key_padding_mask, pos)
```

```python
class TransformerEncoder(nn.Module):

    def __init__(self, encoder_layer, num_layers, norm=None):
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers)  # num_layers deep copies
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, src,
                mask: Optional[Tensor] = None,
                src_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None):
        output = src

        for layer in self.layers:
            output = layer(output, src_mask=mask,
                           src_key_padding_mask=src_key_padding_mask, pos=pos)

        if self.norm is not None:
            output = self.norm(output)

        return output
```

The key encoding detail: the positional encoding is added to the queries and keys only, while the values stay position-free, and this happens at every layer rather than once at the input.

```python
q = k = self.with_pos_embed(src2, pos)
src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask,
                      key_padding_mask=src_key_padding_mask)[0]
```
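
The same pattern can be reproduced with a bare `nn.MultiheadAttention` (shapes illustrative): position enters the attention weights, but not the content being aggregated.

```python
import torch
from torch import nn

attn = nn.MultiheadAttention(embed_dim=256, num_heads=8)
src = torch.randn(850, 2, 256)  # HW x N x C, as produced by the flattening above
pos = torch.randn(850, 2, 256)  # positional encoding, same shape as src

q = k = src + pos               # position enters the similarity computation...
out, _ = attn(q, k, value=src)  # ...but not the values that get mixed
print(out.shape)                # torch.Size([850, 2, 256])
```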

The decoding proceeds as follows:

```python
class TransformerDecoderLayer(nn.Module):

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
                 activation="relu", normalize_before=False):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        return tensor if pos is None else tensor + pos

    def forward_post(self, tgt, memory,
                     tgt_mask: Optional[Tensor] = None,
                     memory_mask: Optional[Tensor] = None,
                     tgt_key_padding_mask: Optional[Tensor] = None,
                     memory_key_padding_mask: Optional[Tensor] = None,
                     pos: Optional[Tensor] = None,
                     query_pos: Optional[Tensor] = None):
        # 1) self-attention among the object queries
        q = k = self.with_pos_embed(tgt, query_pos)
        tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask,
                              key_padding_mask=tgt_key_padding_mask)[0]
        tgt = tgt + self.dropout1(tgt2)
        tgt = self.norm1(tgt)
        # 2) cross-attention from the queries into the encoder memory
        tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos),
                                   key=self.with_pos_embed(memory, pos),
                                   value=memory, attn_mask=memory_mask,
                                   key_padding_mask=memory_key_padding_mask)[0]
        tgt = tgt + self.dropout2(tgt2)
        tgt = self.norm2(tgt)
        # 3) feed-forward block
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = tgt + self.dropout3(tgt2)
        tgt = self.norm3(tgt)
        return tgt

    def forward_pre(self, tgt, memory,
                    tgt_mask: Optional[Tensor] = None,
                    memory_mask: Optional[Tensor] = None,
                    tgt_key_padding_mask: Optional[Tensor] = None,
                    memory_key_padding_mask: Optional[Tensor] = None,
                    pos: Optional[Tensor] = None,
                    query_pos: Optional[Tensor] = None):
        tgt2 = self.norm1(tgt)
        q = k = self.with_pos_embed(tgt2, query_pos)
        tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask,
                              key_padding_mask=tgt_key_padding_mask)[0]
        tgt = tgt + self.dropout1(tgt2)
        tgt2 = self.norm2(tgt)
        tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos),
                                   key=self.with_pos_embed(memory, pos),
                                   value=memory, attn_mask=memory_mask,
                                   key_padding_mask=memory_key_padding_mask)[0]
        tgt = tgt + self.dropout2(tgt2)
        tgt2 = self.norm3(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
        tgt = tgt + self.dropout3(tgt2)
        return tgt

    def forward(self, tgt, memory,
                tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        if self.normalize_before:
            return self.forward_pre(tgt, memory, tgt_mask, memory_mask,
                                    tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos)
        return self.forward_post(tgt, memory, tgt_mask, memory_mask,
                                 tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos)
```
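
The cross-attention step is where the object queries read from the image: queries are built from `tgt + query_pos`, keys from `memory + pos`, and the values are the raw encoder memory. A stripped-down illustration with made-up sizes:

```python
import torch
from torch import nn

cross = nn.MultiheadAttention(embed_dim=256, num_heads=8)
tgt = torch.randn(100, 2, 256)        # current content of the object queries
query_pos = torch.randn(100, 2, 256)  # learned query embeddings
memory = torch.randn(850, 2, 256)     # encoder output
pos = torch.randn(850, 2, 256)        # image position encoding

out, _ = cross(query=tgt + query_pos,  # who is asking
               key=memory + pos,       # where in the image to look
               value=memory)           # what is actually retrieved
print(out.shape)  # torch.Size([100, 2, 256])
```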

```python
class TransformerDecoder(nn.Module):

    def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False):
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
        self.return_intermediate = return_intermediate

    def forward(self, tgt, memory,
                tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        output = tgt

        intermediate = []

        for layer in self.layers:
            output = layer(output, memory, tgt_mask=tgt_mask,
                           memory_mask=memory_mask,
                           tgt_key_padding_mask=tgt_key_padding_mask,
                           memory_key_padding_mask=memory_key_padding_mask,
                           pos=pos, query_pos=query_pos)
            if self.return_intermediate:
                intermediate.append(self.norm(output))

        if self.norm is not None:
            output = self.norm(output)
            if self.return_intermediate:
                intermediate.pop()
                intermediate.append(output)

        if self.return_intermediate:
            return torch.stack(intermediate)

        return output.unsqueeze(0)
```

4. The forward pass

```python
class DETR(nn.Module):
    """ This is the DETR module that performs object detection """
    def __init__(self, backbone, transformer, num_classes, num_queries, aux_loss=False):
        """ Initializes the model.
        Parameters:
            backbone: torch module of the backbone to be used. See backbone.py
            transformer: torch module of the transformer architecture. See transformer.py
            num_classes: number of object classes
            num_queries: number of object queries, i.e. detection slots. This is the maximal number of objects
                         DETR can detect in a single image. For COCO, we recommend 100 queries.
            aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.
        """
        super().__init__()
        self.num_queries = num_queries
        self.transformer = transformer
        hidden_dim = transformer.d_model
        self.class_embed = nn.Linear(hidden_dim, num_classes + 1)
        self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
        self.query_embed = nn.Embedding(num_queries, hidden_dim)
        self.input_proj = nn.Conv2d(backbone.num_channels, hidden_dim, kernel_size=1)
        self.backbone = backbone
        self.aux_loss = aux_loss

    def forward(self, samples: NestedTensor):
        """ The forward expects a NestedTensor, which consists of:
               - samples.tensor: batched images, of shape [batch_size x 3 x H x W]
               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

            It returns a dict with the following elements:
               - "pred_logits": the classification logits (including no-object) for all queries.
                                Shape = [batch_size x num_queries x (num_classes + 1)]
               - "pred_boxes": the normalized box coordinates for all queries, represented as
                               (center_x, center_y, width, height). These values are normalized in [0, 1],
                               relative to the size of each individual image (disregarding possible padding).
                               See PostProcess for information on how to retrieve the unnormalized bounding box.
               - "aux_outputs": optional, only returned when auxiliary losses are activated. It is a list of
                                dictionaries containing the two above keys for each decoder layer.
        """
        if isinstance(samples, (list, torch.Tensor)):
            samples = nested_tensor_from_tensor_list(samples)
        features, pos = self.backbone(samples)

        src, mask = features[-1].decompose()
        assert mask is not None
        hs = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos[-1])[0]

        outputs_class = self.class_embed(hs)
        outputs_coord = self.bbox_embed(hs).sigmoid()
        out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]}
        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord)
        return out

    @torch.jit.unused
    def _set_aux_loss(self, outputs_class, outputs_coord):
        # this is a workaround to make torchscript happy, as torchscript
        # doesn't support dictionaries with non-homogeneous values, such
        # as a dict having both a Tensor and a list.
        return [{'pred_logits': a, 'pred_boxes': b}
                for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]
```
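
Putting the pieces together, a hedged end-to-end smoke test (91 COCO class ids plus the implicit no-object class, 100 queries; all sizes illustrative). Note that `pretrained=is_main_process()` in `Backbone` downloads ImageNet weights on first use.

```python
import torch

backbone = Joiner(Backbone('resnet50', True, False, False),
                  PositionEmbeddingSine(128, normalize=True))
backbone.num_channels = 2048

transformer = Transformer(d_model=256, return_intermediate_dec=True)
model = DETR(backbone, transformer, num_classes=91, num_queries=100, aux_loss=True)

imgs = [torch.randn(3, 480, 640), torch.randn(3, 512, 512)]  # unequal sizes are fine
out = model(imgs)  # padded and masked internally via nested_tensor_from_tensor_list
print(out['pred_logits'].shape)  # torch.Size([2, 100, 92])
print(out['pred_boxes'].shape)   # torch.Size([2, 100, 4])
```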

5. Computing the loss

First, the decoder output hs is projected into per-query class logits and box coordinates:

```python
import torch.nn.functional as F

# from DETR.__init__: one linear head for classes (+1 for "no object"), one MLP for boxes
self.class_embed = nn.Linear(hidden_dim, num_classes + 1)
self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)


class MLP(nn.Module):
    """ Very simple multi-layer perceptron (also called FFN)"""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        return x
```
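
A quick shape check for `bbox_embed` (sizes illustrative): the linear layers broadcast over all leading dimensions, so the 3-layer MLP maps every query of every decoder layer straight to 4 box parameters, which `sigmoid` then squashes into [0, 1].

```python
import torch

bbox_embed = MLP(input_dim=256, hidden_dim=256, output_dim=4, num_layers=3)
hs = torch.randn(6, 2, 100, 256)  # decoder layers x batch x queries x hidden_dim
boxes = bbox_embed(hs).sigmoid()  # normalized (center_x, center_y, width, height)
print(boxes.shape)                # torch.Size([6, 2, 100, 4])
```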

Predictions are then matched one-to-one against the ground-truth boxes by a Hungarian matcher; unmatched queries fall to the no-object class:

```python
from scipy.optimize import linear_sum_assignment

from util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou


class HungarianMatcher(nn.Module):
    """This class computes an assignment between the targets and the predictions of the network

    For efficiency reasons, the targets don't include the no_object. Because of this, in general,
    there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
    while the others are un-matched (and thus treated as non-objects).
    """

    def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1):
        """Creates the matcher

        Params:
            cost_class: This is the relative weight of the classification error in the matching cost
            cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
            cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost
        """
        super().__init__()
        self.cost_class = cost_class
        self.cost_bbox = cost_bbox
        self.cost_giou = cost_giou
        assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs can't be 0"

    @torch.no_grad()
    def forward(self, outputs, targets):
        """ Performs the matching

        Params:
            outputs: This is a dict that contains at least these entries:
                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates

            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                           objects in the target) containing the class labels
                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        bs, num_queries = outputs["pred_logits"].shape[:2]

        # We flatten to compute the cost matrices in a batch
        out_prob = outputs["pred_logits"].flatten(0, 1).softmax(-1)  # [batch_size * num_queries, num_classes]
        out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]

        # Also concat the target labels and boxes
        tgt_ids = torch.cat([v["labels"] for v in targets])
        tgt_bbox = torch.cat([v["boxes"] for v in targets])

        # Compute the classification cost. Contrary to the loss, we don't use the NLL,
        # but approximate it as 1 - proba[target class].
        # The 1 is a constant that doesn't change the matching, so it can be omitted.
        cost_class = -out_prob[:, tgt_ids]

        # Compute the L1 cost between boxes
        cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)

        # Compute the giou cost between boxes
        cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))

        # Final cost matrix
        C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
        C = C.view(bs, num_queries, -1).cpu()

        # solve the assignment problem independently per image
        sizes = [len(v["boxes"]) for v in targets]
        indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
```
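
A tiny worked example of the matcher, with random numbers and DETR's default cost weights (`cost_class=1, cost_bbox=5, cost_giou=2`); `linear_sum_assignment` is scipy's Hungarian-algorithm solver.

```python
import torch

matcher = HungarianMatcher(cost_class=1, cost_bbox=5, cost_giou=2)
outputs = {'pred_logits': torch.randn(1, 100, 92),  # 1 image, 100 queries
           'pred_boxes': torch.rand(1, 100, 4)}     # (cx, cy, w, h), already in [0, 1]
targets = [{'labels': torch.tensor([3, 17]),
            'boxes': torch.tensor([[0.5, 0.5, 0.2, 0.3],
                                   [0.2, 0.8, 0.1, 0.1]])}]

indices = matcher(outputs, targets)
print(indices)  # e.g. [(tensor([q1, q2]), tensor([0, 1]))]: each query paired with one target
```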