import json
from pathlib import Path
from typing import Any, Dict, List, Tuple, Union
from detectools.data.dataset import DetectionDataset
from detectools.utils import load_json, raw_cocodict
from torch.utils.data import random_split
[docs]
def merge_jsons(
jsonfiles: List[str], output_json: str, categories: List[Dict[str, Any]]
):
"""Take a list of json files paths and merge them in one respecting
the images/annotations correspondance id.
Args:
jsonfiles (``List[str]``): List of paths of jsons to merge.
output_json (``str``): Path to store merged json.
categories (``List[Dict[str, Any]]``): List of categories as COCO format.
"""
# initialize counter and output json
image_id = 0
annotation_id = 0
cocodict = raw_cocodict()
cocodict["categories"] = categories
# for each json file, read the file
for jsonfile in jsonfiles:
file = load_json(jsonfile)
assert (
categories == file["categories"]
), f"given categories should match coco jsons categories, got {categories} and {file['categories']}"
# for each image
for image in file["images"]:
# get the corresponding anns in json file
image_annotations = [
a for a in file["annotations"] if a["image_id"] == image["id"]
]
# define the new image id forimage and annotations
image_id += 1
image["id"] = image_id
cocodict["images"].append(image)
# for each annotations
for ann in image_annotations:
# set new id
annotation_id += 1
ann["id"] = annotation_id
ann["image_id"] = image_id
cocodict["annotations"].append(ann)
# write merged json
Path(output_json).write_text(json.dumps(cocodict))
[docs]
def split_dataset(
source_datasets: Union[str, List[str]],
destination: str,
proportions: Tuple[float] = (0.8, 0.2, 0.0),
) -> None:
"""Split one or multiple dataset folder according to split proportions and write a new dataset
with the fusion same split for each datqset (train, valid, test).
Args:
source_datasets (``Union[str, List[str]]``): Path to ad dataset or list of dataset.
destination (``str``): Path tostore subsets.
proportions (``Tuple[float]``, **optional**): Proportions for train, valid & test. Defaults to (0.8, 0.2, 0.0).
"""
# if source_datasets is a string wrap it in list
if isinstance(source_datasets, str):
source_datasets = [source_datasets]
# for each source dataset do the split
for dataset_path in source_datasets:
print(f"Splitting dataset: {dataset_path}")
# create DetectionDataset
dataset = DetectionDataset(dataset_path, preprocessing=None)
# split dataset in 3 subsets
train, valid, test = random_split(dataset, list(proportions))
subsets = list(zip([train, valid, test], ["train", "valid", "test"]))
# for each subset
for subset_dataset, subset_name in subsets:
print(f"Export {subset_name} data:")
subset_path = Path(destination) / subset_name
dataset.export_dataset(
f"{subset_path}",
indices=subset_dataset.indices,
)