#!/usr/bin/env python
import csv
import datetime
import importlib.resources
import json
import numpy as np
import os
from argparse import ArgumentParser, RawTextHelpFormatter
from pathlib import Path
from sklearn.model_selection import KFold
from textwrap import dedent
import Hive.configs
from Hive.utils.file_utils import (
create_nndet_data_folder_tree,
split_dataset,
copy_data_to_dataset_folder,
save_config_json,
generate_dataset_json,
)
from Hive.utils.log_utils import get_logger, add_verbosity_options_to_argparser, log_lvl_from_verbosity_args
# Timestamp string (``YYYY-MM-DD_HH-MM-SS``) taken once, when the module is loaded.
_LOAD_TIME = datetime.datetime.now()
TIMESTAMP = "{:%Y-%m-%d_%H-%M-%S}".format(_LOAD_TIME)

# Description text handed to the argument parser (see ``get_arg_parser``).
_DESC_TEXT = """
Prepare Dataset folder according to the nnDetection specifications, creating and populating the subfolders ``imagesTr``,
``labelsTr``, ``imagesTs`` and ``labelsTs``. In addition, a JSON instance configuration file (as required by nnDetection)
for each label mask is generated, alongside a summary of the train/test split of the dataset.
The label mask images are expected to be as instance segmentation masks (NOT semantic segmentation representations).
"""  # noqa: E501
DESC = dedent(_DESC_TEXT)

# Epilog text handed to the argument parser; ``{filename}`` is filled in with
# the stem of this file before the text is dedented.
_EPILOG_TEMPLATE = """
Example call:
::
{filename} -i /PATH/TO/DATA_FOLDER --task-ID 000 --task-name Example --config-file Example_config.json
{filename} -i /PATH/TO/DATA_FOLDER --task-ID 000 --task-name Example --config-file Example_config.json --test-split 30
"""  # noqa: E501
EPILOG = dedent(_EPILOG_TEMPLATE.format(filename=Path(__file__).stem))
def _load_config(config_file):
    """Return the parsed JSON configuration named by ``config_file``.

    The name is first opened as a filesystem path. If no such file exists, it
    is looked up as a resource inside the ``Hive.configs`` package instead.
    """
    try:
        with open(config_file) as json_file:
            return json.load(json_file)
    except FileNotFoundError:
        with importlib.resources.path(Hive.configs, config_file) as json_path:
            with open(json_path) as json_file:
                return json.load(json_file)


def _write_split_summary(csv_path, rows):
    """Write the ``Subject``/``Split`` rows to ``csv_path`` as a CSV file with a header."""
    # newline="" is what the csv module asks for; without it, platforms that
    # translate "\n" on write end up with an empty line after every row.
    with open(csv_path, "w", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["Subject", "Split"])
        writer.writeheader()
        writer.writerows(rows)


def main():
    """Prepare a dataset folder tree from the command-line arguments.

    Steps, in order:

    1. Parse the arguments and load the JSON configuration.
    2. Derive the experiment folders from the ``ROOT_FOLDER`` environment
       variable and export them as ``raw_data_base``, ``preprocessed_folder``
       and ``RESULTS_FOLDER``.
    3. Split the input data into train and test subjects, assign the train
       subjects to cross-validation folds and write the assignment to
       ``dataset_split.csv`` in the experiment folder.
    4. Copy the train/test data into ``imagesTr``/``labelsTr`` and
       ``imagesTs``/``labelsTs``, generate ``dataset.json`` and save the
       updated configuration JSON in the results folder.

    Returns
    -------
    int or None
        ``1`` if ``ROOT_FOLDER`` is not set in the environment, otherwise ``None``.
    """
    parser = get_arg_parser()
    arguments = vars(parser.parse_args())

    logger = get_logger(
        name=Path(__file__).name,
        level=log_lvl_from_verbosity_args(arguments),
    )

    config_dict = _load_config(arguments["config_file"])

    # ROOT_FOLDER is the only variable that must come from the caller's
    # environment; every other folder below is derived from it.
    try:
        root_folder = Path(os.environ["ROOT_FOLDER"])
    except KeyError:
        logger.error("ROOT_FOLDER is not set as environment variable")
        return 1

    task_folder_name = "Task" + arguments["task_ID"] + "_" + arguments["task_name"]
    experiment_folder = root_folder.joinpath(config_dict["Experiment Name"])
    raw_data_base = str(experiment_folder)
    preprocessed_folder = str(experiment_folder.joinpath(task_folder_name, "preprocessed"))
    results_folder = str(experiment_folder.joinpath(task_folder_name, "results"))

    # Export the derived folders through the process environment as well.
    os.environ["raw_data_base"] = raw_data_base
    os.environ["preprocessed_folder"] = preprocessed_folder
    os.environ["RESULTS_FOLDER"] = results_folder

    dataset_path = experiment_folder.joinpath(task_folder_name, "raw_splitted")

    create_nndet_data_folder_tree(
        raw_data_base,
        arguments["task_name"],
        arguments["task_ID"],
    )

    train_dataset, test_dataset = split_dataset(
        arguments["input_data_folder"], arguments["test_split"], config_dict["Seed"]
    )

    # One summary row per subject: test subjects first, then each training
    # subject labelled with the fold in which it is held out.
    dataset_split = [{"Subject": test_subject, "Split": "Testing"} for test_subject in test_dataset]
    train_dataset_sorted = np.sort(train_dataset)
    kfold = KFold(n_splits=config_dict["n_folds"], shuffle=True, random_state=config_dict["Seed"])
    for fold, (_, held_out_idx) in enumerate(kfold.split(train_dataset_sorted)):
        dataset_split.extend(
            {"Subject": train_dataset_sorted[idx], "Split": "Validation_fold_{}".format(fold)}
            for idx in held_out_idx
        )

    _write_split_summary(experiment_folder.joinpath("dataset_split.csv"), dataset_split)

    # Must be set before the copy calls below, which receive config_dict.
    config_dict["dataset_folder"] = Path(arguments["input_data_folder"]).stem.replace("_", " ")

    copy_data_to_dataset_folder(
        arguments["input_data_folder"],
        train_dataset,
        dataset_path.joinpath("imagesTr"),
        config_dict,
        dataset_path.joinpath("labelsTr"),
        save_label_instance_config=True,
    )
    copy_data_to_dataset_folder(
        arguments["input_data_folder"],
        test_dataset,
        dataset_path.joinpath("imagesTs"),
        config_dict,
        dataset_path.joinpath("labelsTs"),
        save_label_instance_config=True,
    )

    generate_dataset_json(
        str(experiment_folder.joinpath(task_folder_name, "dataset.json")),
        train_dataset,
        test_dataset,
        list(config_dict["Modalities"].values()),
        config_dict["label_dict"],
        task_name=task_folder_name,
        file_extension=config_dict["FileExtension"],
    )

    config_dict["Task_ID"] = arguments["task_ID"]
    config_dict["Task_Name"] = arguments["task_name"]
    config_dict["train_test_split"] = arguments["test_split"]
    config_dict["base_folder"] = raw_data_base

    config_dict["results_folder"] = results_folder
    Path(results_folder).mkdir(parents=True, exist_ok=True)

    config_dict["preprocessing_folder"] = preprocessed_folder
    Path(preprocessed_folder).mkdir(parents=True, exist_ok=True)

    output_json_basename = "Task" + arguments["task_ID"] + "_" + config_dict["Experiment Name"] + ".json"
    save_config_json(config_dict, str(Path(results_folder).joinpath(output_json_basename)))
def get_arg_parser():
    """Build the command-line parser for this script.

    The parser accepts ``-i/--input-data-folder``, ``--task-ID``,
    ``--task-name``, ``--test-split`` and ``--config-file``, plus whatever
    options ``add_verbosity_options_to_argparser`` registers on it.

    Returns
    -------
    ArgumentParser
        The configured parser; ``DESC`` and ``EPILOG`` are shown verbatim in
        the help output because ``RawTextHelpFormatter`` is used.
    """
    pars = ArgumentParser(description=DESC, epilog=EPILOG, formatter_class=RawTextHelpFormatter)

    pars.add_argument(
        "-i",
        "--input-data-folder",
        type=str,
        required=True,
        help="Input Dataset folder",
    )

    pars.add_argument(
        "--task-ID",
        type=str,
        default="100",
        help="Task ID used in the folder path tree creation (Default: 100)",
    )

    pars.add_argument(
        "--task-name",
        type=str,
        required=True,
        help="Task Name used in the folder path tree creation.",  # noqa E501
    )

    pars.add_argument(
        "--test-split",
        type=int,
        choices=range(0, 101),
        # Without a metavar, argparse would list all 101 choices in the usage line.
        metavar="[0-100]",
        default=20,
        help="Split value ( in %% ) to create Test set from Dataset (Default: 20)",
    )

    pars.add_argument(
        "--config-file",
        type=str,
        required=True,
        help="Configuration JSON file with experiment and dataset parameters.",
    )

    add_verbosity_options_to_argparser(pars)

    return pars
if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the exit status, so a
    # failure (return value 1) is visible to the calling shell. A return value
    # of None yields exit status 0.
    raise SystemExit(main())