Source code for Hive_scripts.Hive_create_subset
#!/usr/bin/env python
import datetime
import json
import random
import shutil
from argparse import ArgumentParser, RawTextHelpFormatter
from pathlib import Path
from textwrap import dedent
from Hive.utils.log_utils import add_verbosity_options_to_argparser
# Timestamp captured once at import time, formatted as YYYY-MM-DD_HH-MM-SS.
TIMESTAMP = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Description text passed to the argument parser.
DESC = dedent(
    """
    Generates and saves a subset, given a dataset. The subset data are extracted from the original dataset according to the
    provided ``classes``. A JSON file mapping each subject to the corresponding class is needed ( ``data_class_file``).
    An optional parameter ``max_size`` can be specified to limit the size of the subset.
    """  # noqa: E501
)

# Example invocations; ``{filename}`` is filled in with this script's file name without its extension.
EPILOG = dedent(
    """
    {filename} --data-folder /PATH/TO/DATASET --output-folder /PATH/TO/SUBSET --data-class-file /PATH/TO/SUBJECT_CLASSES.json --subclasses CLASS_1
    {filename} --data-folder /PATH/TO/DATASET --output-folder /PATH/TO/SUBSET --data-class-file /PATH/TO/SUBJECT_CLASSES.json --subclasses CLASS_1 --max-size 100
    """  # noqa: E501
).format(filename=Path(__file__).stem)
def main():
    """Create a subset of a dataset according to the command-line arguments.

    Reads the JSON file given by ``--data-class-file`` (a mapping from subject
    folder name to class label), shuffles the subjects, and copies the folders
    of the subjects whose class is listed in ``--subclasses`` from
    ``--data-folder`` into ``--output-folder``.

    At most ``--max-size`` subjects are selected; when the option is omitted,
    every matching subject is selected. A subject whose folder already exists
    in the output folder is not copied again, but still counts towards the
    limit.

    Raises
    ------
    ValueError
        If ``--max-size`` cannot be converted to an integer.
    """
    parser = get_arg_parser()
    arguments = vars(parser.parse_args())

    with open(arguments["data_class_file"], "r", encoding="utf-8") as fp:
        data_class_dict = json.load(fp)

    classes = arguments["subclasses"]
    data_folder = Path(arguments["data_folder"])
    output_folder = Path(arguments["output_folder"])
    output_folder.mkdir(parents=True, exist_ok=True)

    # Convert the limit once; without a limit every subject may be selected.
    max_size = arguments["max_size"]
    max_size = len(data_class_dict) if max_size is None else int(max_size)

    # Shuffle so that a size-limited subset is a random sample of the matching subjects.
    patients = list(data_class_dict)
    random.shuffle(patients)

    count = 0
    for patient in patients:
        # Strict bound: stop as soon as ``max_size`` subjects have been selected
        # (the previous ``count <= max_size`` check let one extra subject through).
        if count >= max_size:
            break
        if data_class_dict[patient] not in classes:
            continue
        destination = output_folder / patient
        # ``copytree`` fails on an existing destination, so already-present subjects are kept as they are.
        if not destination.is_dir():
            shutil.copytree(data_folder / patient, destination)
        count += 1
def get_arg_parser():
    """Build the command-line argument parser for the subset-creation script.

    The parser uses the module-level ``DESC`` as description and ``EPILOG``
    (example invocations) as epilog, and is passed through
    ``add_verbosity_options_to_argparser`` before being returned.

    Returns
    -------
    ArgumentParser
        Parser accepting ``--data-folder``, ``--output-folder``,
        ``--data-class-file``, ``--subclasses`` (one or more values) and the
        optional ``--max-size``.
    """
    pars = ArgumentParser(description=DESC, epilog=EPILOG, formatter_class=RawTextHelpFormatter)

    pars.add_argument(
        "--data-folder",
        type=str,
        required=True,
        help="Input dataset folder",
    )

    pars.add_argument(
        "--output-folder",
        type=str,
        required=True,
        help="Output subset folder",
    )

    pars.add_argument(
        "--data-class-file",
        type=str,
        required=True,
        help="JSON file including the class label for each volume in the dataset.",
    )

    pars.add_argument(
        "--subclasses",
        type=str,
        nargs="+",
        required=True,
        help="List of classes from where to select the subset data.",
    )

    # Parsed as an integer so that a non-numeric value is rejected by argparse
    # with a usage error instead of failing later during the copy loop.
    pars.add_argument(
        "--max-size",
        type=int,
        required=False,
        default=None,
        help="Maximum size of the generated subset. Default ``None``: no size limit is set.",
    )

    add_verbosity_options_to_argparser(pars)

    return pars
# Run the subset creation only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()