Source code for detectools.models.yolo

from math import ceil, floor
from typing import Any, Dict, List, Literal, Tuple, Union

import torch
import torchvision.transforms.v2.functional as F
from detectools import Task
from detectools.formats import BatchedFormats, DetectionFormat
from detectools.models.base import BaseModel
from torch import Tensor
from ultralytics.cfg import get_cfg
from ultralytics.models.yolo.detect.train import DetectionModel
from ultralytics.nn.tasks import attempt_load_one_weight
from ultralytics.utils import DEFAULT_CFG



[docs]
class YoloDetection(DetectionModel, BaseModel):
    """YOLO detection model class in detectools. This class inheriths from DetectionModel_ (Ultralytics) and BaseModel (detectools).
    Load yolo architecture from ultralytics repository. If pretrained load a pretrain model from ultralytics.

    .. _DetectionModel:
            https://docs.ultralytics.com/reference/nn/tasks/?h=detectionmodel#ultralytics.nn.tasks.DetectionModel.__init__

    Args:
        architecture (``str``, **optional**): Architecture to use to build YOLO model. Check Ultralytics availables architectures_ . Defaults to "yolov8m".
        num_classes (``int``, **optional**): Number of classes in the task. Defaults to 1.
        pretrained (``bool``, **optional**): To use pretrained weights. Defaults to True.
        confidence_thr (``float``, **optional**): Confidence score threshold to consider object as true prediction. Defaults to 0.5.
        max_detection (``int``, **optional**): Maximum number of object to predict on one image. Defaults to 300.
        nms_threshold (``float``, **optional**): IoU threshold to consider 2 boxes as overlapping for Non Max Suppression algorithm.. Defaults to 0.45.
    
    .. _architectures:
        https://docs.ultralytics.com/models/yolov8/#supported-tasks-and-modes

    Attributes:
    -----------

    Attributes:
        confidence_thr (``float``): Confidence score threshold to consider object as true prediction.
        max_detection (``int``): Maximum number of object to predict on one image.
        nms_threshold (``float``): IoU threshold to consider 2 boxes as overlapping for Non Max Suppression algorithm.
        num_classes (``int``): Number of classes. 
    
    Methods:
    -----------
    """

    confidence_thr: float = 0.5
    max_detection: int = 300
    nms_threshold: float = 0.45
    num_classes: int = 1

    def __init__(
        self,
        architecture: str = "yolov8m",
        num_classes: int = 1,
        pretrained=True,
        confidence_thr: float = 0.5,
        max_detection: int = 300,
        nms_threshold: float = 0.45,
        *args,
        **kwargs,
    ):
        
        # assert Task mode is "instance_segmentation"
        assert (
            Task.mode == "detection"
        ), f"Task mode should be 'detection' to construct YoloDetection object, got {Task.mode}"
        # build model from ultralytics config
        super().__init__(f"{architecture}.yaml", nc=num_classes, *args, **kwargs)
        self.args = get_cfg(DEFAULT_CFG)
        self.criterion = self.init_criterion()
        self.confidence_thr = confidence_thr
        self.max_detection = max_detection
        self.nms_threshold = nms_threshold
        self.num_classes = num_classes
        # load weights from ultralytics repo if pretrained
        if pretrained:
            architecture = attempt_load_one_weight(
                f"{architecture}.pt",
            )
            self.load(architecture[0])

    # override

[docs]
    def to_device(self, device: Literal["cpu", "cuda"]):
        """Send model & criterion to device.

        Args:
            device (``Literal['cpu', 'cuda']``): Device to send model on.
        """
        self.to(device)
        self.criterion = self.init_criterion()



[docs]
    def prepare_image(self, images: Tensor) -> Tuple[Tensor, Tuple[int]]:
        """Pad images if needed & return padding values.

        Args:
            images (``Tensor``): Batch_images.

        Returns:
            ``Tuple[Tensor, Tuple[int]]``:
                - Padded images.
                - Padding values.
        """
        # get borders padding
        padding_values = self.yolo_pad_requirements(images)
        # pad images
        images = F.pad(images, list(padding_values))
        return images, padding_values



[docs]
    def prepare_target(self, targets: BatchedFormats) -> Dict[str, Tensor]:
        """Transform DetectionFormat targets into yolo targets format.

        Args:
            targets (``BatchedFormats``): Batch targets.

        Returns:
            ``Dict[str, Tensor]``:
                - Targets in YOLO format.
        """
        # transform boxes
        targets.apply("set_boxes_format", "CXCYWH")
        targets.apply("normalize")
        # get values
        targets: List[DetectionFormat] = targets.split()
        boxes = torch.cat([t.get("boxes") for t in targets])
        labels = torch.cat([t.get("labels") for t in targets])
        device = labels.device
        images_indices = torch.cat(
            [torch.full((t.size,), i, device=device) for i, t in enumerate(targets)]
        )
        # reshape data to fit YoloV8detection loss
        indexes = images_indices[..., None]
        classes = labels[..., None]

        batch_targets = {"batch_idx": indexes, "cls": classes, "bboxes": boxes}

        return batch_targets


    # override

[docs]
    def prepare(
        self, images: Tensor, targets: BatchedFormats = None
    ) -> Union[Tensor, Tuple[Tensor, Dict[str, Tensor]]]:
        """Transform images and targets into YOLO specific format for prediction & loss computation.

        Args:
            images (``Tensor``): Batch images.
            targets (``BatchedFormats``, **optional**): Batched targets from DetectionDataset.

        Returns:
            ``Union[Tensor, Tuple[Tensor, Dict[str, Tensor]]]``:
                - Images data prepared for YOLO.
                - If targets: images + targets prepared for YOLO.
        """

        (left, top, right, bottom) = self.yolo_pad_requirements(images)
        # pad images & target
        images = F.pad(images, list((left, top, right, bottom)))
        if targets:
            prepared_targets = targets.clone()
            # prepare targets for yolo
            prepared_targets.apply("pad", left, top, right, bottom)
            prepared_targets = self.prepare_target(prepared_targets)
            return images, prepared_targets
        else:
            return images



[docs]
    def yolo_pad_requirements(
        self, input_object: Union[Tensor, DetectionFormat]
    ) -> List[int]:
        """Return values for padding to fit 'divisible by 32' requirement.

        Args:
            input_object (``Union[Tensor, DetectionFormat]``): Input to pad (image or DetectionFormat).

        Returns:
            ``List[int]``:
                - Padding values.
        """
        # get spatial size
        if isinstance(input_object, DetectionFormat):
            h, w = input_object.spatial_size
        elif isinstance(input_object, Tensor):
            h, w = input_object.shape[-2:]  # (H,W)
        # get pad values
        diff_h, diff_w = h % 32, w % 32
        pad_h = 32 - diff_h if diff_h > 0 else 0
        pad_w = 32 - diff_w if diff_w > 0 else 0
        # define padding for each border
        if pad_h or pad_w:
            half_h, half_w = pad_h / 2, pad_w / 2
            left, top, right, bottom = (
                ceil(half_w),
                ceil(half_h),
                floor(half_w),
                floor(half_h),
            )
        else:
            left, top, right, bottom = (0, 0, 0, 0)
        return (left, top, right, bottom)



[docs]
    def retrieve_spatial_size(self, raw_outputs: List[Tensor]) -> Tuple[int]:
        """Retrieve image shape from raw_outputs and stride values.

        Args:
            raw_outputs (``List[Tensor]``): Raw ouptuts from YOLO model.

        Returns:
            ``Tuple[int]``:
                - Size of input image (H, W).
        """
        h = int(raw_outputs[0].shape[-2] * self.stride[0])
        w = int(raw_outputs[0].shape[-1] * self.stride[0])
        return (h, w)


    # override

[docs]
    def build_results(
        self, raw_outputs: List[Tensor], prebuild_outputs: Tensor
    ) -> BatchedFormats:
        """Transform model outputs into Batch DetectionFormat for results.

        Args:
            raw_outputs (``List[Tensor]``): Model outputs.
            prebuild_outputs (``Tensor``): Extracted boxes from YOLO raw outputs.

        Returns:
            ``BatchedFormats``:
                - Batched predictions.
        """

        device = prebuild_outputs.device
        prebuild_outputs = prebuild_outputs.unbind()
        h, w = self.retrieve_spatial_size(raw_outputs)
        # create empty Format to merge batch results
        results = []
        # for each prediction
        for prediction in prebuild_outputs:
            # send pred in good pshape
            prediction = prediction.permute(1, 0)
            # get best class and corresponding score
            best_class = torch.argmax(prediction[:, 4:], dim=1)
            confidence = torch.max(prediction[:, 4:], dim=1)
            # gather box cxcywh coordinates
            boxes_coordinates = prediction[:, :4]
            # build result
            result = DetectionFormat(
                spatial_size=(h, w),
                boxes=boxes_coordinates,
                labels=best_class,
                scores=confidence.values,
                box_format="CXCYWH",
            )
            # convert boxes in coco
            result.set_boxes_format("XYWH")
            # objects selections
            result = result.confidence(self.confidence_thr)
            result = result.nms(self.nms_threshold)
            result = result.max_detections(self.max_detection)
            # stack batch results
            results.append(result)

        if len(results) == 0:
            results = DetectionFormat.empty((h, w), device=device)

        results = BatchedFormats(results)
        return results



[docs]
    def compute_loss(
        self, raw_outputs: Tensor, targets: Dict[str, Tensor]
    ) -> Dict[str, Tensor]:
        """Compute loss with predictions & targets.

        Args:
            raw_outputs (``Any``): Raw output of model.
            targets (``DetectionFormat``): Targets in YOLO format.

        Returns:
            ``Dict[str, Tensor]``:
                - Loss dict with total loss (key: "loss") & sublosses.
        """
        loss, loss_detail = self.criterion(raw_outputs, targets)
        loss_dict = {
            "loss": loss,
            "loss_box": loss_detail[0],
            "loss_cls": loss_detail[1],
            "loss_dfl": loss_detail[2],
        }
        return loss_dict


    # override

[docs]
    def run_forward(
        self,
        images: Tensor,
        targets: BatchedFormats,
        predict: bool = False,
    ) -> Union[Dict[str, Tensor], Tuple[Dict[str, Tensor], BatchedFormats]]:
        """Compute loss from images and if target passed, compute loss & return both loss dict
        and results.

        Args:
            images (``Tensor``): Batch RGB images.
            targets (``BatchedFormats``): Batch targets.
            predict (``bool``, **optional**): To return predictions or not. Defaults to False.

        Returns:
            ``Union[Dict[str, Tensor], Tuple[Dict[str, Tensor], BatchedFormats]]``:
                - Loss dict.
                - If predict: predictions.
        """
        assert predict == (
            not self.training
        ), f"Model mode should be equal to predict boolean, got {self.training} & {predict}"
        # prepare inputs
        prepared_images, prepared_targets = self.prepare(images, targets=targets)
        # run forward pass
        if self.training:
            raw_outputs = self(prepared_images)
        else:
            prebuild_output, raw_outputs = self(prepared_images)
        # compute loss
        loss_dict = self.compute_loss(raw_outputs, prepared_targets)
        # return predictions if needed
        if predict:
            predictions = self.build_results(raw_outputs, prebuild_output)
            left, top, _, _ = self.yolo_pad_requirements(images)
            h, w = images.shape[-2:]
            predictions.apply("crop", top, left, h, w)
            return loss_dict, predictions
        else:
            return loss_dict


    # override

[docs]
    def get_predictions(self, images: Tensor) -> BatchedFormats:
        """Prepare images, Apply YOLO forward pass and build results.

        Args:
            images (``Tensor``): RGB images Tensor.

        Returns:
            ``BatchedFormats``:
                - Predictions for images as BatchedFormats.
        """
        self.eval()
        # get original spatial size
        ori_h, ori_w = images.shape[-2:]
        # pad images
        images, (left, top, _, _) = self.prepare_image(images)
        # predict
        prebuild_output, raw_outputs = self(images)
        results = self.build_results(raw_outputs, prebuild_output)
        # crop to back at original spatial size
        results.apply("crop", top, left, ori_h, ori_w)

        return results