Source code for detectools.data_management

import json
from pathlib import Path
from typing import Any, Dict, List, Tuple, Union

from detectools.data.dataset import DetectionDataset
from detectools.utils import load_json, raw_cocodict
from torch.utils.data import random_split

def merge_jsons(
    jsonfiles: List[str], output_json: str, categories: List[Dict[str, Any]]
):
    """Merge several COCO json files into one, renumbering images and annotations.

    Images are renumbered from 1 in the order they are read (file after file),
    annotations likewise, and every annotation is re-pointed at the new id of
    the image it belonged to in its own file. Annotations whose ``image_id``
    matches no image of their file are not copied to the output.

    Args:
        jsonfiles (``List[str]``): List of paths of jsons to merge.
        output_json (``str``): Path to store merged json.
        categories (``List[Dict[str, Any]]``): List of categories as COCO format.

    Raises:
        AssertionError: If the ``categories`` of one of the files differ from
            ``categories``.
    """
    # running counters shared by all files so ids stay unique in the output
    image_id = 0
    annotation_id = 0
    cocodict = raw_cocodict()
    cocodict["categories"] = categories

    for jsonfile in jsonfiles:
        coco = load_json(jsonfile)
        # explicit raise (not ``assert``) so the check survives ``python -O``;
        # the exception type and message are the ones callers already get
        if categories != coco["categories"]:
            raise AssertionError(
                f"given categories should match coco jsons categories, got {categories} and {coco['categories']}"
            )

        # Group annotations by their ORIGINAL image id before any id is
        # rewritten. Matching them lazily while renumbering in place lets a
        # freshly assigned id collide with the original id of a later image,
        # which then steals annotations of an earlier one.
        annotations_by_image: Dict[Any, List[Dict[str, Any]]] = {}
        for ann in coco["annotations"]:
            annotations_by_image.setdefault(ann["image_id"], []).append(ann)

        for image in coco["images"]:
            # pop: a group is consumed by the first image carrying that id
            image_annotations = annotations_by_image.pop(image["id"], [])

            image_id += 1
            image["id"] = image_id
            cocodict["images"].append(image)

            for ann in image_annotations:
                annotation_id += 1
                ann["id"] = annotation_id
                ann["image_id"] = image_id
                cocodict["annotations"].append(ann)

    # write merged json
    Path(output_json).write_text(json.dumps(cocodict))
def split_dataset(
    source_datasets: Union[str, List[str]],
    destination: str,
    proportions: Tuple[float, float, float] = (0.8, 0.2, 0.0),
) -> None:
    """Split one or several dataset folders into train / valid / test subsets.

    Each source dataset is split independently with ``random_split`` using the
    same proportions, and each resulting subset is exported to
    ``destination/train``, ``destination/valid`` and ``destination/test``.
    Every source dataset exports into these same three folders.
    NOTE(review): how ``DetectionDataset.export_dataset`` behaves when the
    target folder already holds data is not visible here; confirm that it
    merges rather than overwrites.

    Args:
        source_datasets (``Union[str, List[str]]``): Path to a dataset or list
            of dataset paths.
        destination (``str``): Path to store subsets.
        proportions (``Tuple[float, float, float]``, **optional**): Proportions
            for train, valid & test. Defaults to (0.8, 0.2, 0.0).

    Raises:
        ValueError: If ``proportions`` does not hold exactly three values.
    """
    # Fail early with a readable message instead of an unpacking error raised
    # after the dataset has already been loaded and split.
    lengths = list(proportions)
    if len(lengths) != 3:
        raise ValueError(
            f"proportions must hold 3 values (train, valid, test), got {len(lengths)}"
        )

    # a single path is handled as a one-element list
    if isinstance(source_datasets, str):
        source_datasets = [source_datasets]

    subset_names = ("train", "valid", "test")
    for dataset_path in source_datasets:
        print(f"Splitting dataset: {dataset_path}")
        dataset = DetectionDataset(dataset_path, preprocessing=None)
        # random_split returns one Subset per entry of ``lengths``
        subsets = random_split(dataset, lengths)
        for subset_dataset, subset_name in zip(subsets, subset_names):
            print(f"Export {subset_name} data:")
            subset_path = Path(destination) / subset_name
            # export only the samples drawn for this subset
            dataset.export_dataset(
                f"{subset_path}",
                indices=subset_dataset.indices,
            )