DETR3D#
- pydantic model vision_architectures.nets.detr_3d.DETR3DEncoderConfig[source]#
Bases: Attention3DWithMLPConfig, AbsolutePositionEmbeddings3DConfig
JSON schema (summarized):
  - dim (int, required) – Dimension of the input and output features.
  - grid_size (tuple[int, int, int] | None = None)
  - learnable (bool = False)
  - num_heads (int, required) – Number of query heads.
  - ratio_q_to_kv_heads (int = 1)
  - logit_scale_learnable (bool = False)
  - attn_drop_prob (float = 0.0)
  - proj_drop_prob (float = 0.0)
  - max_attention_batch_size (int = -1) – Runs attention by splitting the inputs into chunks of this size. 0 means no chunking. Useful for large inputs during inference.
  - mlp_ratio (int = 4) – Ratio of the hidden dimension in the MLP to the input dimension.
  - activation (str = 'gelu') – Activation function for the MLP.
  - mlp_drop_prob (float = 0.0) – Dropout probability for the MLP.
  - norm_location ('pre' | 'post' = 'post') – Location of the normalization layer in the attention block. Pre-normalization applies it before the attention operation; post-normalization applies it after.
  - layer_norm_eps (float = 1e-06) – Epsilon value for the layer normalization.
  - num_encoder_layers (int, required) – Number of transformer encoder layers.
- Config:
arbitrary_types_allowed: bool = True
extra: str = ignore
validate_default: bool = True
validate_assignment: bool = True
validate_return: bool = True
- Fields:
  - field num_encoder_layers: int [Required] – Number of transformer encoder layers.
- pydantic model vision_architectures.nets.detr_3d.DETR3DDecoderConfig[source]#
Bases: Attention1DWithMLPConfig, AbsolutePositionEmbeddings3DConfig
JSON schema (summarized): identical to DETR3DEncoderConfig above, except that num_encoder_layers is replaced by num_decoder_layers (int, required) – Number of transformer decoder layers. Required: dim, num_heads, num_decoder_layers.
- Config:
arbitrary_types_allowed: bool = True
extra: str = ignore
validate_default: bool = True
validate_assignment: bool = True
validate_return: bool = True
- Fields:
  - field num_decoder_layers: int [Required] – Number of transformer decoder layers.
- pydantic model vision_architectures.nets.detr_3d.DETRBBoxMLPConfig[source]#
Bases: CustomBaseModel
- Config:
arbitrary_types_allowed: bool = True
extra: str = ignore
validate_default: bool = True
validate_assignment: bool = True
validate_return: bool = True
- Fields:
  - field dim: int [Required] – Dimension of the input features. Validated by: validate
  - field num_classes: int [Required] – Number of classes for the bounding box predictions. Validated by: validate
  - field activation: str = 'relu' – Activation function for the MLP. Validated by: validate
  - field bbox_size_activation: Literal['sigmoid', 'softplus'] = 'sigmoid' – Activation function for the bounding box size. "sigmoid" for normalized coordinates, "softplus" for absolute coordinates. Validated by: validate
- Validators:
  - validate » all fields
- pydantic model vision_architectures.nets.detr_3d.DETR3DConfig[source]#
Bases: DETR3DEncoderConfig, DETR3DDecoderConfig, DETRBBoxMLPConfig
JSON schema (summarized): the union of the DETR3DEncoderConfig, DETR3DDecoderConfig, and DETRBBoxMLPConfig fields above (with activation defaulting to 'gelu' here), plus num_objects, classification_cost_fn, and classification_loss_fn, documented below. The ClassBalancedCrossEntropyLossConfig accepted by classification_loss_fn has fields num_classes (int, required) – number of classes to weight cross entropy loss – and ema_decay (float = 0.99) – exponential moving average decay; the default 0.99 has a half-life of ~69 steps. Required: dim, num_classes, num_heads, num_decoder_layers, num_encoder_layers, num_objects.
- Config:
arbitrary_types_allowed: bool = True
extra: str = ignore
validate_default: bool = True
validate_assignment: bool = True
validate_return: bool = True
- Fields:
  - field num_objects: int [Required] – Maximum number of objects to detect. Validated by: validate
  - field classification_cost_fn: Literal['softmax', 'log_softmax'] = 'softmax' – Method used to compute the classification cost in Hungarian matching. softmax is the default (as per the paper), but log_softmax is encouraged: the final bipartite matching loss uses cross-entropy, which applies log_softmax and NLLLoss internally, making log_softmax the more consistent choice. Validated by: validate
  - field classification_loss_fn: Literal['cross_entropy', 'class_balanced_cross_entropy'] | dict | ClassBalancedCrossEntropyLossConfig = 'cross_entropy' – Loss function for bbox classification. Validated by: validate
- Validators:
  - validate » all fields
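As a concrete illustration, here is a minimal sketch of constructing a DETR3DConfig; all values are illustrative rather than recommended defaults.

```python
# Minimal sketch of building a DETR3DConfig (values are illustrative).
from vision_architectures.nets.detr_3d import DETR3DConfig

config = DETR3DConfig(
    dim=256,                # feature dimension of the backbone embeddings
    num_classes=4,          # number of object classes
    num_heads=8,            # number of attention query heads
    num_encoder_layers=6,   # transformer encoder depth
    num_decoder_layers=6,   # transformer decoder depth
    num_objects=100,        # maximum number of objects to detect
)

# The modules below also accept plain dicts, from which the config
# instance is created automatically.
config_dict = config.model_dump()
```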
- class vision_architectures.nets.detr_3d.DETR3DEncoderLayer(config={}, checkpointing_level=0, **kwargs)[source]#
Bases: Module
DETR3D encoder layer. This class is designed for 3D inputs, e.g. medical images and videos.
- __init__(config={}, checkpointing_level=0, **kwargs)[source]#
Initialize a DETR3DEncoderLayer block. Activation checkpointing level 3.
This transformer encoder layer allows modifying the query and key values independently of the residual connection.
- Parameters:
  - config (Attention3DWithMLPConfig) – An instance of the Config class that contains all the configuration parameters. It can also be passed as a dictionary, and the instance will be created automatically.
  - checkpointing_level (int) – The level of checkpointing to use for activation checkpointing. Refer to ActivationCheckpointing for more details.
  - **kwargs – Additional keyword arguments for configuration.
- forward(qkv, q_modifier=None, k_modifier=None, channels_first=True)[source]#
Forward pass of the DETR3DEncoderLayer block.
- Parameters:
  - qkv (Tensor) – Tensor of shape (B, C, Z, Y, X) or (B, Z, Y, X, C) representing the input features.
  - q_modifier (Optional[Callable]) – If provided, the query will be passed to the callable before the attention operation.
  - k_modifier (Optional[Callable]) – If provided, the key will be passed to the callable before the attention operation.
  - channels_first (bool) – Whether the inputs are in channels first format (B, C, …) or not (B, …, C).
- Return type: Tensor
- Returns: Tensor of shape (B, C, Z, Y, X) or (B, Z, Y, X, C) representing the output features.
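A hedged usage sketch for this layer follows; the config values and tensor sizes are assumptions for illustration.

```python
# Hypothetical usage of DETR3DEncoderLayer (shapes follow the docstring above).
import torch
from vision_architectures.nets.detr_3d import DETR3DEncoderLayer

layer = DETR3DEncoderLayer(config={"dim": 256, "num_heads": 8})  # illustrative config
x = torch.randn(2, 256, 8, 16, 16)  # (B, C, Z, Y, X) input features

# q_modifier / k_modifier can inject e.g. position embeddings into the query
# and key without touching the residual path; identity callables are used here.
out = layer(x, q_modifier=lambda q: q, k_modifier=lambda k: k, channels_first=True)
assert out.shape == x.shape  # output keeps the input layout
```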
- class vision_architectures.nets.detr_3d.DETR3DDecoderLayer(config={}, checkpointing_level=0, **kwargs)[source]#
Bases: Module
A cross-attention transformer block. This class is designed for 3D inputs, e.g. medical images and videos.
- __init__(config={}, checkpointing_level=0, **kwargs)[source]#
Initialize a DETR3DDecoderLayer block. Activation checkpointing level 3.
This transformer decoder layer allows modifying the query and key values independently of the residual connection.
- Parameters:
  - config (Attention1DWithMLPConfig) – An instance of the Config class that contains all the configuration parameters. It can also be passed as a dictionary, and the instance will be created automatically.
  - checkpointing_level (int) – The level of checkpointing to use for activation checkpointing. Refer to ActivationCheckpointing for more details.
  - **kwargs – Additional keyword arguments for configuration.
- forward(q, kv, q1_modifier=None, k1_modifier=None, q2_modifier=None, k2_modifier=None, channels_first=True)[source]#
Forward pass of the DETR3DDecoderLayer block.
- Parameters:
  - q (Tensor) – Tensor of shape (B, T, C) representing the input features; the query, key, and value tensor used for self-attention.
  - kv (Tensor) – Tensor of shape (B, C, Z, Y, X) or (B, Z, Y, X, C) representing the input features; the key and value tensors used for cross-attention.
  - q1_modifier (Optional[Callable]) – If provided, the self-attention query tensor will be passed through the callable before the attention operation.
  - k1_modifier (Optional[Callable]) – If provided, the self-attention key tensor will be passed through the callable before the attention operation.
  - q2_modifier (Optional[Callable]) – If provided, the cross-attention query tensor will be passed through the callable before the attention operation.
  - k2_modifier (Optional[Callable]) – If provided, the cross-attention key tensor will be passed through the callable before the attention operation.
  - channels_first (bool) – Whether the inputs are in channels first format (B, C, …) or not (B, …, C).
- Return type: Tensor
- Returns: Tensor of shape (B, T, C) representing the updated object query features.
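Below is a hedged sketch of calling this decoder layer; shapes follow the parameter documentation and the config values are illustrative.

```python
# Hypothetical usage of DETR3DDecoderLayer.
import torch
from vision_architectures.nets.detr_3d import DETR3DDecoderLayer

layer = DETR3DDecoderLayer(config={"dim": 256, "num_heads": 8})  # illustrative config

queries = torch.randn(2, 100, 256)          # (B, T, C) object queries
features = torch.randn(2, 256, 8, 16, 16)   # (B, C, Z, Y, X) image features

out = layer(queries, features, channels_first=True)  # updated queries, (B, T, C)
```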
- class vision_architectures.nets.detr_3d.DETR3DEncoder(config={}, checkpointing_level=0, **kwargs)[source]#
Bases: Module, PyTorchModelHubMixin
DETR Transformer encoder.
- __init__(config={}, checkpointing_level=0, **kwargs)[source]#
Initialize the DETR3DEncoder. Activation checkpointing level 4.
- Parameters:
  - config (DETR3DEncoderConfig) – An instance of the Config class that contains all the configuration parameters. It can also be passed as a dictionary, and the instance will be created automatically.
  - checkpointing_level (int) – The level of checkpointing to use for activation checkpointing. Refer to ActivationCheckpointing for more details.
  - **kwargs – Additional keyword arguments for configuration.
- forward(embeddings, spacings=None, return_intermediates=False, channels_first=True)[source]#
Forward pass of the DETR3D encoder.
- Parameters:
  - embeddings (Tensor) – Embeddings from the backbone. Tensor of shape (B, C, Z, Y, X) or (B, Z, Y, X, C) representing the input features.
  - spacings (Optional[Tensor]) – Spacing information of shape (B, 3) of the input features.
  - return_intermediates (bool) – If True, also returns the outputs of all layers. Defaults to False.
  - channels_first (bool) – Whether the inputs are in channels first format (B, C, …) or not (B, …, C).
- Return type: Tensor | tuple[Tensor, list[Tensor]]
- Returns: If return_intermediates is True, returns the final encoded embeddings and a list of outputs from all layers. Otherwise, returns only the final encoded embeddings.
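A hedged usage sketch, with illustrative config values and tensor sizes:

```python
# Hypothetical usage of DETR3DEncoder with intermediate outputs.
import torch
from vision_architectures.nets.detr_3d import DETR3DEncoder

encoder = DETR3DEncoder(config={"dim": 256, "num_heads": 8, "num_encoder_layers": 6})
embeddings = torch.randn(2, 256, 8, 16, 16)  # (B, C, Z, Y, X) backbone features
spacings = torch.ones(2, 3)                  # (B, 3) voxel spacings, if available

encoded, intermediates = encoder(embeddings, spacings=spacings, return_intermediates=True)
# `intermediates` contains the output of every encoder layer.
```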
- class vision_architectures.nets.detr_3d.DETR3DDecoder(config={}, checkpointing_level=0, **kwargs)[source]#
Bases: Module, PyTorchModelHubMixin
DETR Transformer decoder.
- __init__(config={}, checkpointing_level=0, **kwargs)[source]#
Initialize the DETR3DDecoder. Activation checkpointing level 4.
- Parameters:
  - config (DETR3DDecoderConfig) – An instance of the Config class that contains all the configuration parameters. It can also be passed as a dictionary, and the instance will be created automatically.
  - checkpointing_level (int) – The level of checkpointing to use for activation checkpointing. Refer to ActivationCheckpointing for more details.
  - **kwargs – Additional keyword arguments for configuration.
- forward(object_queries, embeddings, spacings=None, return_intermediates=False, channels_first=True)[source]#
Forward pass of the DETR3D decoder.
- Parameters:
  - object_queries (Tensor) – Tokens that represent object queries. Tensor of shape (B, T, C) representing the input features.
  - embeddings (Tensor) – Actual embeddings of the input. Tensor of shape (B, C, Z, Y, X) or (B, Z, Y, X, C) representing the input features.
  - spacings (Optional[Tensor]) – Spacing information of shape (B, 3) of the input features.
  - return_intermediates (bool) – If True, also returns the outputs of all layers. Defaults to False.
  - channels_first (bool) – Whether the inputs are in channels first format (B, C, …) or not (B, …, C).
- Return type: Tensor | tuple[Tensor, list[Tensor]]
- Returns: If return_intermediates is True, returns the final object embeddings and a list of outputs from all layers. Otherwise, returns only the final object embeddings.
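A hedged usage sketch, again with illustrative values:

```python
# Hypothetical usage of DETR3DDecoder.
import torch
from vision_architectures.nets.detr_3d import DETR3DDecoder

decoder = DETR3DDecoder(config={"dim": 256, "num_heads": 8, "num_decoder_layers": 6})

object_queries = torch.randn(2, 100, 256)   # (B, T, C) object query tokens
features = torch.randn(2, 256, 8, 16, 16)   # (B, C, Z, Y, X) encoded features

object_embeddings = decoder(object_queries, features)  # (B, T, C)
```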
- class vision_architectures.nets.detr_3d.DETRBBoxMLP(config={}, **kwargs)[source]#
Bases: Module
DETR Bounding Box MLP. This module predicts bounding boxes and class scores from object query embeddings.
- __init__(config={}, **kwargs)[source]#
Initialize the DETRBBoxMLP.
- Parameters:
  - config (DETRBBoxMLPConfig) – An instance of the Config class that contains all the configuration parameters. It can also be passed as a dictionary, and the instance will be created automatically.
  - **kwargs – Additional keyword arguments for configuration.
- forward(object_embeddings)[source]#
Forward pass of the DETRBBoxMLP.
- Parameters:
  - object_embeddings (Tensor) – Object embeddings from the DETR decoder. Tensor of shape (B, T, C) representing the input features.
- Return type: Tensor
- Returns: A tensor of shape (B, num_possible_objects, 1 objectness logit + 6 bounding box parameters + num_classes) containing the predicted bounding boxes and class scores. The bounding boxes are in the format (z_center, y_center, x_center, z_size, y_size, x_size), where the centers and sizes are normalized to the range [0, 1] corresponding to the input dimensions.
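A hedged usage sketch follows; the slicing assumes the concatenation order implied by the return description (objectness, then 6 bbox parameters, then class scores), and the config values are illustrative.

```python
# Hypothetical usage of DETRBBoxMLP.
import torch
from vision_architectures.nets.detr_3d import DETRBBoxMLP

head = DETRBBoxMLP(config={"dim": 256, "num_classes": 4})  # illustrative config
object_embeddings = torch.randn(2, 100, 256)               # (B, T, C)

pred = head(object_embeddings)   # (B, T, 1 + 6 + num_classes)
objectness = pred[..., 0]        # objectness score
bboxes = pred[..., 1:7]          # (zc, yc, xc, zs, ys, xs), normalized to [0, 1]
class_scores = pred[..., 7:]     # per-class scores
```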
- class vision_architectures.nets.detr_3d.DETR3D(config={}, checkpointing_level=0, **kwargs)[source]#
Bases: Module, PyTorchModelHubMixin
DETR 3D model. Also implements the bipartite matching loss, which is essential for DETR training.
- __init__(config={}, checkpointing_level=0, **kwargs)[source]#
Initialize the DETR3D. Activation checkpointing level 4.
- Parameters:
  - config (DETR3DConfig) – An instance of the Config class that contains all the configuration parameters. It can also be passed as a dictionary, and the instance will be created automatically.
  - checkpointing_level (int) – The level of checkpointing to use for activation checkpointing. Refer to ActivationCheckpointing for more details.
  - **kwargs – Additional keyword arguments for configuration.
- forward(embeddings, spacings=None, channels_first=True, return_intermediates=False, process_intermediates=True)[source]#
Forward pass of the DETR3D.
- Parameters:
  - embeddings (Tensor) – Input features from the backbone. Tensor of shape (B, C, Z, Y, X) or (B, Z, Y, X, C) representing the input features.
  - spacings (Optional[Tensor]) – Spacing information of shape (B, 3) of the input features.
  - channels_first (bool) – Whether the inputs are in channels first format (B, C, …) or not (B, …, C).
  - return_intermediates (bool) – If True, also returns the outputs of all layers. Defaults to False.
  - process_intermediates (bool) – If True, also passes the layer outputs through the bbox_mlp. Requires return_intermediates to be True as well.
- Return type: Tensor | tuple[Tensor, Tensor, list[Tensor], list[Tensor]]
- Returns: If return_intermediates is True, a tuple containing the bounding boxes, object embeddings, decoder layer outputs, and encoder layer outputs. Otherwise, returns only the bounding boxes. The bounding boxes are in the format (z_center, y_center, x_center, z_size, y_size, x_size), where the centers and sizes are normalized to the range [0, 1] corresponding to the input dimensions.
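An end-to-end sketch of running the full model, with illustrative values:

```python
# Run DETR3D on backbone features (all shapes and values are illustrative).
import torch
from vision_architectures.nets.detr_3d import DETR3D

model = DETR3D(config={
    "dim": 256, "num_classes": 4, "num_heads": 8,
    "num_encoder_layers": 6, "num_decoder_layers": 6, "num_objects": 100,
})
embeddings = torch.randn(2, 256, 8, 16, 16)  # (B, C, Z, Y, X) from a 3D backbone

pred = model(embeddings)  # (B, num_objects, 1 + 6 + num_classes) raw predictions
```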
- bipartite_matching_loss(pred, target, intermediate_preds=None, classification_cost_weight=1.0, bbox_l1_cost_weight=1.0, bbox_giou_cost_weight=1.0, matched_indices=None, update_class_prevalences=True, reduction='mean', return_matching=False, return_loss_components=False)[source]#
Bipartite matching loss for DETR. The classes are expected to optimize for a multi-class classification problem. Expects raw logits as class predictions, not probabilities; use logits_to_scores_fn=None in the forward function to avoid applying any transformation.
- Parameters:
  - pred (Tensor) – Predicted bounding boxes and class scores. It should be of shape (B, num_objects, 6 + 1 + num_classes). The number of objects and number of classes are inferred from here.
  - target (Tensor | list[Tensor]) – Target bounding boxes and class scores. If provided as a list, each element should be a tensor for the corresponding batch element in pred, so the list should have length B. Each tensor should have at most as many objects as pred. The class encoding can either be exactly the same as in pred, or a single argmax-encoded (label) column.
  - intermediate_preds (Optional[list[Tensor]]) – A list of any intermediate decoder outputs to use for auxiliary losses. If provided, the returned loss values become a list instead of single values. The matching used is the one computed for the final prediction, i.e. pred.
  - classification_cost_weight (float) – Weight for the classification cost in Hungarian matching. Only used when matched_indices is None.
  - bbox_l1_cost_weight (float) – Weight for the bounding box L1 cost in Hungarian matching. Only used when matched_indices is None.
  - bbox_giou_cost_weight (float) – Weight for the bounding box GIoU cost in Hungarian matching. Only used when matched_indices is None.
  - matched_indices (Optional[tuple[list[int], list[int]]]) – Hungarian matching is performed only if this is None; otherwise these indices are used to match pred and target boxes. Useful when calculating auxiliary losses with intermediate layers of DETR3D.
  - update_class_prevalences (bool) – Whether or not to update class prevalences in the class balanced cross entropy loss. Useful when calculating auxiliary losses with intermediate layers of DETR3D.
  - reduction (str) – Specifies the reduction to apply to the output.
  - return_matching (bool) – Whether or not to return the matched indices from the bipartite matching.
  - return_loss_components (bool) – Whether or not to return the individual loss components.
- Return type:
Tensor
|list
[Tensor
] |tuple
[Tensor
,tuple
] |tuple
[list
[Tensor
],tuple
] |tuple
[Tensor
,dict
[str
,Tensor
]] |tuple
[list
[Tensor
],list
[dict
[str
,Tensor
]]] |tuple
[Tensor
,tuple
,dict
[str
,Tensor
]] |tuple
[list
[Tensor
],tuple
,list
[dict
[str
,Tensor
]]]- Returns:
The first element is always the main loss that has been calculated. If no intermediate_preds were provided, a tensor containing the bipartite matching loss with the shape depending on the reduction argument is present, else it is a list of the same for all preds provided with the first one being the main pred.
If return_matching is True, also returns a list of tuples containing matched indices for predictions and targets. Each tuple is of the form (pred_indices, target_indices), where pred_indices and target_indices are lists of indices for the matched predictions and targets, respectively.
If return_loss_components is True, also returns a dict of each loss component reduced based on reduction. If intermediate_preds were also provided, it is a list of the same.
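A hedged sketch of a training step with this loss, continuing the DETR3D example above; the trailing layout of each target tensor reuses pred's layout, which is an assumption for illustration (a single label-encoded class column is also accepted per the docs).

```python
# Hedged sketch of bipartite_matching_loss; `model` and `pred` come from
# the DETR3D example above, and target contents here are random placeholders.
import torch

targets = [
    torch.rand(3, pred.shape[-1]),  # 3 ground-truth boxes for batch element 0
    torch.rand(5, pred.shape[-1]),  # 5 ground-truth boxes for batch element 1
]
loss, matching = model.bipartite_matching_loss(pred, targets, return_matching=True)
loss.backward()
```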
- hungarian_matching(pred, target, classification_cost_weight=1.0, bbox_l1_cost_weight=1.0, bbox_giou_cost_weight=1.0)#
Hungarian matching between predictions and targets.
- Parameters:
  - pred (Tensor) – Predicted bounding boxes and class scores. It should be of shape (B, num_objects, 6 + 1 + num_classes). The number of objects and number of classes are inferred from here.
  - target (list[Tensor]) – Target bounding boxes and class scores, expected in argmax encoding or one-hot encoding.
  - classification_cost_weight (float) – Weight for the classification cost.
  - bbox_l1_cost_weight (float) – Weight for the bounding box L1 cost.
  - bbox_giou_cost_weight (float) – Weight for the bounding box GIoU cost.
- Return type: list[tuple[list[int], list[int]]]
- Returns:
A list of tuples containing matched indices for predictions and targets. Each tuple is of the form (pred_indices, target_indices), where pred_indices and target_indices are lists of indices for the matched predictions and targets, respectively.
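A short hedged sketch of standalone matching, reusing `model`, `pred`, and `targets` from the examples above:

```python
# Hungarian matching on its own; one (pred_indices, target_indices) tuple
# is returned per batch element.
matches = model.hungarian_matching(pred, targets)
for pred_indices, target_indices in matches:
    print(pred_indices, target_indices)
```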
- static bbox_iou(pred_bboxes, target_bboxes)[source]#
Compute the IoU between two matched sets of bounding boxes.
- Parameters:
  - pred_bboxes (Tensor) – Predicted bounding boxes of shape (num_boxes, 6). The bounding boxes are in the format (z_center, y_center, x_center, z_size, y_size, x_size), where the centers and sizes are normalized to the range [0, 1] corresponding to the input dimensions.
  - target_bboxes (Tensor) – Target bounding boxes of shape (num_boxes, 6), in the same format.
- Return type: Tensor
- Returns: A tensor containing the IoU of each matched pair.
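A small worked example, assuming the normalized (center, size) format documented above; the box values are illustrative.

```python
# Worked example for bbox_iou on matched box pairs.
import torch
from vision_architectures.nets.detr_3d import DETR3D

pred_bboxes = torch.tensor([[0.5, 0.5, 0.5, 0.2, 0.2, 0.2],
                            [0.3, 0.3, 0.3, 0.1, 0.1, 0.1]])
target_bboxes = torch.tensor([[0.5, 0.5, 0.5, 0.2, 0.2, 0.2],   # identical box
                              [0.8, 0.8, 0.8, 0.1, 0.1, 0.1]])  # disjoint box

ious = DETR3D.bbox_iou(pred_bboxes, target_bboxes)
# Expected roughly: ious ≈ [1.0, 0.0] (perfect overlap, then no overlap)
```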
- static pairwise_bbox_iou(pred_bboxes, target_bboxes)[source]#
Compute the IoUs between all combinations of predicted and target bounding boxes.
- Parameters:
  - pred_bboxes (Tensor) – Predicted bounding boxes of shape (num_objects, 6). The bounding boxes are in the format (z_center, y_center, x_center, z_size, y_size, x_size), where the centers and sizes are normalized to the range [0, 1] corresponding to the input dimensions.
  - target_bboxes (Tensor) – Target bounding boxes of shape (<=num_objects, 6), in the same format.
- Return type: Tensor
- Returns: A tensor containing the IoUs of all combinations.
- static bbox_giou(pred_bboxes, target_bboxes)[source]#
Compute the Generalized IoU between two matched sets of bounding boxes.
- Parameters:
  - pred_bboxes (Tensor) – Predicted bounding boxes of shape (num_boxes, 6). The bounding boxes are in the format (z_center, y_center, x_center, z_size, y_size, x_size), where the centers and sizes are normalized to the range [0, 1] corresponding to the input dimensions.
  - target_bboxes (Tensor) – Target bounding boxes of shape (num_boxes, 6), in the same format.
- Return type: Tensor
- Returns: A tensor containing the Generalized IoU of each matched pair.
- static pairwise_bbox_giou(pred_bboxes, target_bboxes)[source]#
Compute the Generalized IoUs between all combinations of predicted and target bounding boxes.
- Parameters:
  - pred_bboxes (Tensor) – Predicted bounding boxes of shape (num_objects, 6). The bounding boxes are in the format (z_center, y_center, x_center, z_size, y_size, x_size), where the centers and sizes are normalized to the range [0, 1] corresponding to the input dimensions.
  - target_bboxes (Tensor) – Target bounding boxes of shape (<=num_objects, 6), in the same format.
- Return type: Tensor
- Returns: A tensor containing the Generalized IoUs of all combinations.