diff --git a/tools/dataset_preparation/__init__.py b/tools/dataset_preparation/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tools/dataset_preparation/dataset/__init__.py b/tools/dataset_preparation/dataset/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tools/dataset_preparation/dataset/base/__init__.py b/tools/dataset_preparation/dataset/base/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tools/dataset_preparation/dataset/base/dataset_preparation_base.py b/tools/dataset_preparation/dataset/base/dataset_preparation_base.py
new file mode 100644
index 00000000..2e685caa
--- /dev/null
+++ b/tools/dataset_preparation/dataset/base/dataset_preparation_base.py
@@ -0,0 +1,39 @@
+from abc import ABC
+from pathlib import Path
+from typing import Any, Dict
+
+import mmengine
+
+
+class DatasetPreparationBase(ABC):
+
+    def __init__(self, root_path: Path, config: Any, info_save_path: Path, info_version: str) -> None:
+        """
+        Base class for dataset preparation.
+        :param root_path: Root path that contains the data.
+        :param config: Configuration for the dataset preparation.
+        :param info_save_path: Path to save a dictionary of dataset information.
+        :param info_version: Version name for dataset information.
+        """
+        self.root_path = root_path
+        self.config = config
+        self.info_save_path = info_save_path
+        self.info_version = info_version
+
+        # Create the output directory if it does not already exist
+        self.info_save_path.mkdir(exist_ok=True, parents=True)
+
+    def run(self) -> None:
+        """
+        Run dataset preparation to convert the dataset to the corresponding info format.
+        """
+        raise NotImplementedError
+
+    def save_info_file(self, info: Dict[str, Any], info_file_name: str) -> None:
+        """
+        Save a dictionary of dataset information to a pickle file consumed by downstream tasks.
+        :param info: Selected info from the dataset.
+        :param info_file_name: Info output file name.
+        """
+        info_file_save_path = self.info_save_path / info_file_name
+        mmengine.dump(info, info_file_save_path)
diff --git a/tools/dataset_preparation/dataset/t4dataset/__init__.py b/tools/dataset_preparation/dataset/t4dataset/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tools/dataset_preparation/dataset/t4dataset/t4dataset_detection3d_preparation.py b/tools/dataset_preparation/dataset/t4dataset/t4dataset_detection3d_preparation.py
new file mode 100644
index 00000000..8abfd319
--- /dev/null
+++ b/tools/dataset_preparation/dataset/t4dataset/t4dataset_detection3d_preparation.py
@@ -0,0 +1,49 @@
+from pathlib import Path
+from typing import Any, Dict
+
+from t4_devkit import Tier4
+
+from tools.dataset_preparation.dataset.t4dataset.t4dataset_preparation_base import T4DatasetPreparationBase
+from tools.detection3d.create_data_t4dataset import get_info
+
+
+class T4DatasetDetection3DPreparation(T4DatasetPreparationBase):
+
+    def __init__(
+        self,
+        root_path: Path,
+        config: Any,
+        info_save_path: Path,
+        info_version: str,
+        max_sweeps: int,
+        use_available_dataset_version: bool = False,
+    ) -> None:
+        """
+        T4dataset preparation for the 3D detection task.
+        :param max_sweeps: Max number of lidar sweeps to aggregate per sample.
+        """
+        super().__init__(
+            root_path=root_path,
+            config=config,
+            info_save_path=info_save_path,
+            info_version=info_version,
+            use_available_dataset_version=use_available_dataset_version,
+        )
+        self._max_sweeps = max_sweeps
+
+    def process_t4dataset(self, t4_dataset: Tier4) -> Dict[str, Any]:
+        """
+        Process a t4dataset and convert it to a format usable by the AWML framework.
+        :return: A dict of {frame index: frame info}.
+        """
+        infos = {}
+        for i, sample in enumerate(t4_dataset.sample):
+            infos[i] = get_info(cfg=self.config, t4=t4_dataset, sample=sample, i=i, max_sweeps=self._max_sweeps)
+        return infos
+
+    def extract_metainfo(self) -> Dict[str, Any]:
+        """
+        Extract metainfo.
+        :return: A dict of metainfo about the data preparation.
+        """
+        return {"version": self.info_version, "task_name": "3d_detection", "classes": self.config.class_names}
diff --git a/tools/dataset_preparation/dataset/t4dataset/t4dataset_preparation_base.py b/tools/dataset_preparation/dataset/t4dataset/t4dataset_preparation_base.py
new file mode 100644
index 00000000..a94421d7
--- /dev/null
+++ b/tools/dataset_preparation/dataset/t4dataset/t4dataset_preparation_base.py
@@ -0,0 +1,114 @@
+from pathlib import Path
+from typing import Any, Dict, List
+
+import yaml
+from mmengine.logging import print_log
+from t4_devkit import Tier4
+
+from tools.dataset_preparation.dataset.base.dataset_preparation_base import DatasetPreparationBase
+from tools.dataset_preparation.enum import DatasetInfoSplitKey
+from tools.detection3d.create_data_t4dataset import get_scene_root_dir_path
+
+
+class T4DatasetPreparationBase(DatasetPreparationBase):
+
+    def __init__(
+        self,
+        root_path: Path,
+        config: Any,
+        info_save_path: Path,
+        info_version: str,
+        use_available_dataset_version: bool = False,
+    ) -> None:
+        """
+        Base class for T4dataset preparation.
+        :param use_available_dataset_version: If True, fall back to whatever dataset version is available locally.
+        """
+        super().__init__(
+            root_path=root_path, config=config, info_save_path=info_save_path, info_version=info_version
+        )
+        self.use_available_dataset_version = use_available_dataset_version
+        self.t4dataset_info_file_template = "t4dataset_{}_infos_{}.pkl"
+
+    def process_t4dataset(self, t4_dataset: Tier4) -> Dict[str, Any]:
+        """
+        Process a t4dataset and convert it to a format usable by the AWML framework.
+        :param t4_dataset: Tier4 data object for a t4dataset.
+        :return: A dict of {frame identifier: frame data}.
+        """
+        # Subclasses must implement task-specific processing.
+        raise NotImplementedError
+
+    def save_t4_info_file(self, info: Dict[str, Any], split_name: str) -> None:
+        """
+        Save T4 info to a file named after the given split.
+        :param info: Selected T4 info for the split.
+        """
+        info_split_file_name = self.t4dataset_info_file_template.format(self.info_version, split_name)
+        self.save_info_file(info=info, info_file_name=info_split_file_name)
+
+    def extract_metainfo(self) -> Dict[str, Any]:
+        """
+        Extract metainfo. The base implementation returns an empty dict; subclasses may override.
+        """
+        return {}
+
+    def run(
+        self,
+    ) -> None:
+        """
+        Run dataset preparation to convert the dataset to the corresponding info format.
+        """
+        data_info = {
+            DatasetInfoSplitKey.TRAIN: [],
+            DatasetInfoSplitKey.VAL: [],
+            DatasetInfoSplitKey.TEST: [],
+        }
+        metainfo = self.extract_metainfo()
+
+        for dataset_version in self.config.dataset_version_list:
+            dataset_list = Path(self.config.dataset_version_config_root) / (dataset_version + ".yaml")
+            with open(dataset_list, "r") as f:
+                dataset_list_dict: Dict[str, List[str]] = yaml.safe_load(f)
+
+            for split in [DatasetInfoSplitKey.TRAIN, DatasetInfoSplitKey.VAL, DatasetInfoSplitKey.TEST]:
+                print_log(f"Creating data info for split: {split}", logger="current")
+                for scene_id in dataset_list_dict.get(split.value, []):
+                    print_log(f"Creating data info for scene: {scene_id}", logger="current")
+
+                    t4_dataset_id, t4_dataset_version_id = scene_id.split("/")
+                    scene_root_dir_path = (
+                        Path(self.root_path) / dataset_version / t4_dataset_id / t4_dataset_version_id
+                    )
+                    if not scene_root_dir_path.exists():
+                        if self.use_available_dataset_version:
+                            print_log(
+                                "Warning: The dataset version specified in the config file does not exist. "
+                                "Falling back to whatever is available locally."
+                            )
+                            scene_root_dir_path = get_scene_root_dir_path(
+                                self.root_path, dataset_version, t4_dataset_id
+                            )
+                        else:
+                            raise ValueError(f"{scene_root_dir_path} does not exist.")
+
+                    t4_dataset = Tier4(
+                        version="annotation",
+                        data_root=scene_root_dir_path,
+                        verbose=False,
+                    )
+
+                    info = self.process_t4dataset(t4_dataset=t4_dataset)
+
+                    data_info[split].extend(info.values())
+
+        info_pairs = {
+            DatasetInfoSplitKey.TRAIN: data_info[DatasetInfoSplitKey.TRAIN],
+            DatasetInfoSplitKey.VAL: data_info[DatasetInfoSplitKey.VAL],
+            DatasetInfoSplitKey.TEST: data_info[DatasetInfoSplitKey.TEST],
+            DatasetInfoSplitKey.TRAIN_VAL: data_info[DatasetInfoSplitKey.TRAIN] + data_info[DatasetInfoSplitKey.VAL],
+            DatasetInfoSplitKey.ALL: data_info[DatasetInfoSplitKey.TRAIN] + data_info[DatasetInfoSplitKey.VAL] + data_info[DatasetInfoSplitKey.TEST],
+        }
+        for split_name, info in info_pairs.items():
+            format_info = {"data_list": info, "metainfo": metainfo}
+            self.save_t4_info_file(info=format_info, split_name=split_name.value)
diff --git a/tools/dataset_preparation/dataset_preparation.py b/tools/dataset_preparation/dataset_preparation.py
new file mode 100644
index 00000000..b54e4bda
--- /dev/null
+++ b/tools/dataset_preparation/dataset_preparation.py
@@ -0,0 +1,103 @@
+"""Script to convert datasets to info pickles."""
+
+import argparse
+from pathlib import Path
+from typing import Any
+
+from mmengine.config import Config
+from mmengine.logging import print_log
+
+from tools.dataset_preparation.dataset.base.dataset_preparation_base import DatasetPreparationBase
+from tools.dataset_preparation.dataset.t4dataset.t4dataset_detection3d_preparation import (
+    T4DatasetDetection3DPreparation,
+)
+from tools.dataset_preparation.enum import DatasetTask
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Create data info for T4dataset")
+    parser.add_argument(
+        "--task",
+        choices=["t4_detection3d", "t4_detection2d", "t4_classification2d"],
+        help="Choose a task for data preparation.",
+    )
+    parser.add_argument(
+        "--config",
+        type=str,
+        required=True,
+        help="config for T4dataset",
+    )
+    parser.add_argument(
+        "--root_path",
+        type=str,
+        required=True,
+        help="specify the root path of the dataset",
+    )
+    parser.add_argument(
+        "--version",
+        type=str,
+        required=True,
+        help="product version",
+    )
+    parser.add_argument(
+        "--max_sweeps",
+        type=int,
+        required=False,
+        help="number of lidar sweeps per example",
+    )
+    parser.add_argument(
+        "-o",
+        "--out_dir",
+        type=str,
+        required=True,
+        help="output directory of the info files",
+    )
+    parser.add_argument(
+        "--use_available_dataset_version",
+        action="store_true",
+        help="Fall back to the locally available dataset version if the one specified in the config file does not exist.",
+    )
+    args = parser.parse_args()
+    return args
+
+
+def build_dataset_task(
+    dataset_task: DatasetTask, config: Any, args: Any
+) -> DatasetPreparationBase:
+    """Build a DatasetPreparationBase instance for the given task."""
+    if dataset_task == DatasetTask.T4DETECTION3D:
+        assert (
+            args.max_sweeps
+        ), f"max_sweeps must be set when the data preparation task is {DatasetTask.T4DETECTION3D}."
+        dataset_preparation = T4DatasetDetection3DPreparation(
+            root_path=Path(args.root_path),
+            config=config,
+            info_save_path=Path(args.out_dir),
+            info_version=args.version,
+            max_sweeps=args.max_sweeps,
+            use_available_dataset_version=args.use_available_dataset_version,
+        )
+    else:
+        raise ValueError(f"Task: {dataset_task} not supported yet!")
+
+    print_log(f"Built {dataset_task}")
+    return dataset_preparation
+
+
+def main():
+    """Main entrypoint to run dataset preparation."""
+    # Parse command-line arguments
+    args = parse_args()
+
+    # Load config
+    config = Config.fromfile(args.config)
+
+    # Build the task-specific preparation (DatasetTask is looked up by value)
+    dataset_preparation = build_dataset_task(dataset_task=DatasetTask(args.task), config=config, args=args)
+
+    # Run dataset preparation
+    dataset_preparation.run()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/dataset_preparation/enum.py b/tools/dataset_preparation/enum.py
new file mode 100644
index 00000000..2617f8b4
--- /dev/null
+++ b/tools/dataset_preparation/enum.py
@@ -0,0 +1,27 @@
+from enum import Enum
+
+
+class DatasetInfoSplitKey(Enum):
+    """Supported split names in data preparation."""
+
+    TRAIN = "train"
+    VAL = "val"
+    TEST = "test"
+    TRAIN_VAL = "trainval"
+    ALL = "all"
+
+    def __str__(self):
+        """String representation."""
+        return self.value
+
+
+class DatasetTask(Enum):
+    """Supported dataset tasks in data preparation."""
+
+    T4DETECTION3D = "t4_detection3d"
+    T4DETECTION2D = "t4_detection2d"
+    T4CLASSIFICATION2D = "t4_classification2d"
+
+    def __str__(self):
+        """String representation."""
+        return self.value
diff --git a/tools/detection2d/create_data_t4dataset.py b/tools/detection2d/create_data_t4dataset.py
index f86ea762..44fe4063 100644
--- a/tools/detection2d/create_data_t4dataset.py
+++ b/tools/detection2d/create_data_t4dataset.py
@@ -128,7 +128,7 @@ def assign_ids_and_save_detection_data(
                     for instance in entry.instances
                 ],
             }
-            for i, entry in enumerate(detection_data.data_list)
+            for i, entry in enumerate(detection_data.data_list)
         ],
     }
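
Usage sketch for the new entrypoint (the flags come from parse_args above; the angle-bracket values are illustrative placeholders, not values from this diff), run from the repository root:

    python -m tools.dataset_preparation.dataset_preparation \
        --task t4_detection3d \
        --config <path/to/t4dataset_config>.py \
        --root_path <path/to/dataset_root> \
        --version <info_version> \
        --max_sweeps 2 \
        -o <path/to/output_dir>

This writes t4dataset_<info_version>_infos_<split>.pkl files for the train, val, test, trainval, and all splits into the output directory, following the filename template in T4DatasetPreparationBase.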