Shortcuts

Source code for mim.commands.gridsearch

# Copyright (c) OpenMMLab. All rights reserved.
import copy as cp
import itertools
import os
import os.path as osp
import subprocess
import sys
import time
from concurrent.futures import ProcessPoolExecutor as Executor
from typing import Optional, Tuple, Union

import click

from mim.click import CustomCommand
from mim.utils import (
    args2string,
    echo_error,
    echo_success,
    exit_with_error,
    get_config,
    get_installed_path,
    highlighted_error,
    is_installed,
    module_full_name,
    recursively_find,
    set_config,
    string2args,
)

PYTHON = sys.executable


@click.command(
    name='gridsearch',
    context_settings=dict(ignore_unknown_options=True),
    cls=CustomCommand)
@click.argument('package', type=str)
@click.argument('config', type=str)
@click.option(
    '-l',
    '--launcher',
    type=click.Choice(['none', 'pytorch', 'slurm'], case_sensitive=False),
    default='none',
    help='Job launcher')
@click.option(
    '--port',
    type=int,
    default=29500,
    help=('The port used for inter-process communication '
          '(only applicable to slurm / pytorch launchers)'))
@click.option(
    '-G', '--gpus', type=int, default=1, help='Number of gpus to use')
@click.option(
    '-g',
    '--gpus-per-node',
    type=int,
    help=('Number of gpus per node to use '
          '(only applicable to launcher == "slurm")'))
@click.option(
    '-c',
    '--cpus-per-task',
    type=int,
    default=2,
    help='Number of cpus per task (only applicable to launcher == "slurm")')
@click.option(
    '-p',
    '--partition',
    type=str,
    help='The partition to use (only applicable to launcher == "slurm")')
@click.option(
    '-j', '--max-jobs', type=int, help='Max parallel number', default=1)
@click.option(
    '--srun-args', type=str, help='Other srun arguments that might be used')
@click.option('-y', '--yes', is_flag=True, help='Don\'t ask for confirmation.')
@click.option(
    '--search-args', type=str, help='Arguments for hyper parameters search')
@click.argument('other_args', nargs=-1, type=click.UNPROCESSED)
def cli(package: str,
        config: str,
        gpus: int,
        gpus_per_node: int,
        partition: str,
        cpus_per_task: int = 2,
        max_jobs: int = 1,
        launcher: str = 'none',
        port: int = 29500,
        srun_args: Optional[str] = None,
        search_args: str = '',
        yes: bool = False,
        other_args: tuple = ()) -> None:
    """Perform Hyper-parameter search.

    Example:

    \b
    # Parameter search on a single server with CPU by setting `gpus` to 0 and
    # 'launcher' to 'none' (if applicable). The training script of the
    # corresponding codebase will fail if it doesn't support CPU training.
    > mim gridsearch mmcls resnet101_b16x8_cifar10.py --work-dir tmp --gpus \
        0 --search-args '--optimizer.lr 1e-2 1e-3'
    # Parameter search with on a single server with one GPU, search learning
    # rate
    > mim gridsearch mmcls resnet101_b16x8_cifar10.py --work-dir tmp --gpus \
        1 --search-args '--optimizer.lr 1e-2 1e-3'
    # Parameter search with on a single server with one GPU, search
    # weight_decay
    > mim gridsearch mmcls resnet101_b16x8_cifar10.py --work-dir tmp --gpus \
        1 --search-args '--optimizer.weight_decay 1e-3 1e-4'
    # Parameter search with on a single server with one GPU, search learning
    # rate and weight_decay
    > mim gridsearch mmcls resnet101_b16x8_cifar10.py --work-dir tmp --gpus \
        1 --search-args '--optimizer.lr 1e-2 1e-3 --optimizer.weight_decay \
        1e-3 1e-4'
    # Parameter search on a slurm HPC with one 8-GPU node, search learning
    # rate and weight_decay
    > mim gridsearch mmcls resnet101_b16x8_cifar10.py --work-dir tmp --gpus \
        8 --partition partition_name --gpus-per-node 8 --launcher slurm \
        --search-args '--optimizer.lr 1e-2 1e-3 --optimizer.weight_decay 1e-3 \
        1e-4'
    # Parameter search on a slurm HPC with one 8-GPU node, search learning
    # rate and weight_decay, max parallel jobs is 2
    > mim gridsearch mmcls resnet101_b16x8_cifar10.py --work-dir tmp --gpus \
        8 --partition partition_name --gpus-per-node 8 --launcher slurm \
        --max-jobs 2 --search-args '--optimizer.lr 1e-2 1e-3 \
        --optimizer.weight_decay 1e-3 1e-4'
    # Print the help message of sub-command search
    > mim gridsearch -h
    # Print the help message of sub-command search and the help message of the
    # training script of codebase mmcls
    > mim gridsearch mmcls -h
    """

    is_success, msg = gridsearch(
        package=package,
        config=config,
        gpus=gpus,
        gpus_per_node=gpus_per_node,
        cpus_per_task=cpus_per_task,
        max_jobs=max_jobs,
        partition=partition,
        launcher=launcher,
        port=port,
        srun_args=srun_args,
        search_args=search_args,
        yes=yes,
        other_args=other_args)

    if is_success:
        echo_success(msg)  # type: ignore
    else:
        exit_with_error(msg)


[docs]def gridsearch( package: str, config: str, gpus: int, gpus_per_node: int = None, cpus_per_task: int = 2, max_jobs: int = 1, partition: str = None, launcher: str = 'none', port: int = 29500, srun_args: Optional[str] = None, search_args: str = '', yes: bool = True, other_args: tuple = () ) -> Tuple[bool, Union[str, Exception]]: """Hyper parameter search with given config. Args: package (str): The codebase name. config (str): The config file path. If not exists, will search in the config files of the codebase. gpus (int): Number of gpus used for training. gpus_per_node (int, optional): Number of gpus per node to use (only applicable to launcher == "slurm"). Defaults to None. cpus_per_task (int, optional): Number of cpus per task to use (only applicable to launcher == "slurm"). Defaults to None. partition (str, optional): The partition name (only applicable to launcher == "slurm"). Defaults to None. max_jobs (int, optional): The max number of workers. Applicable only if launcher == 'slurm'. Default to 1. launcher (str, optional): The launcher used to launch jobs. Defaults to 'none'. port (int, optional): The port used for inter-process communication (only applicable to slurm / pytorch launchers). Default to 29500. srun_args (str, optional): Other srun arguments that might be used, all arguments should be in a string. Defaults to None. search_args (str, optional): Arguments for hyper parameters search, all arguments should be in a string. Defaults to None. yes (bool): Don\'t ask for confirmation. Default: True. other_args (tuple, optional): Other arguments, will be passed to the codebase's training script. Defaults to (). """ full_name = module_full_name(package) if full_name == '': msg = f"Can't determine a unique package given abbreviation {package}" raise ValueError(highlighted_error(msg)) package = full_name # If launcher == "slurm", must have following args if launcher == 'slurm': msg = ('If launcher is slurm, ' 'gpus-per-node and partition should not be None') flag = (gpus_per_node is not None) and (partition is not None) if not flag: raise AssertionError(highlighted_error(msg)) if not is_installed(package): msg = (f'The codebase {package} is not installed, ' 'do you want to install it? ') if yes or click.confirm(msg): click.echo(f'Installing {package}') cmd = ['mim', 'install', package] ret = subprocess.check_call(cmd) if ret != 0: msg = f'{package} is not successfully installed' raise RuntimeError(highlighted_error(msg)) else: click.echo(f'{package} is successfully installed') else: msg = f'You can not train this model without {package} installed.' return False, msg pkg_root = get_installed_path(package) if not osp.exists(config): # configs is put in pkg/.mim in PR #68 config_root = osp.join(pkg_root, '.mim', 'configs') if not osp.exists(config_root): # If not pkg/.mim/config, try to search the whole pkg root. config_root = pkg_root # pkg/.mim/configs is a symbolic link to the real config folder, # so we need to follow links. files = recursively_find( pkg_root, osp.basename(config), followlinks=True) if len(files) == 0: msg = (f"The path {config} doesn't exist and we can not " f'find the config file in codebase {package}.') raise ValueError(highlighted_error(msg)) elif len(files) > 1: msg = ( f"The path {config} doesn't exist and we find multiple " f'config files with same name in codebase {package}: {files}.') raise ValueError(highlighted_error(msg)) # Use realpath instead of the symbolic path in pkg/.mim config_path = osp.realpath(files[0]) click.echo( f"The path {config} doesn't exist but we find the config file " f'in codebase {package}, will use {config_path} instead.') config = config_path # tools will be put in package/.mim in PR #68 train_script = osp.join(pkg_root, '.mim', 'tools', 'train.py') if not osp.exists(train_script): train_script = osp.join(pkg_root, 'tools', 'train.py') # parsing search_args # the search_args looks like: # "--optimizer.lr 0.001 0.01 0.1 --optimizer.weight_decay 1e-4 1e-3 1e-2" search_args_dict = string2args(search_args) if not len(search_args_dict): msg = 'Should specify at least one arg for searching' raise ValueError(highlighted_error(msg)) for k in search_args_dict: if search_args_dict[k] is bool: msg = f'Should specify at least one value for arg {k}' raise ValueError(highlighted_error(msg)) try: from mmengine import Config except ImportError: try: from mmcv import Config except ImportError: raise ImportError( 'Please install mmengine to use the gridsearch command: ' '`mim install mmengine`.') cfg = Config.fromfile(config) for arg in search_args_dict: try: arg_value = get_config(cfg, arg) if arg_value is not None and not isinstance(arg_value, str): search_args_dict[arg] = [ eval(x) for x in search_args_dict[arg] ] for val in search_args_dict[arg]: assert type(val) == type(arg_value) except AssertionError: msg = f'Arg {arg} not in the config file. ' raise AssertionError(highlighted_error(msg)) other_args_dict = string2args(' '.join(other_args)) work_dir = other_args_dict.get('work-dir') if work_dir: work_dir = work_dir[0] else: work_dir = cfg.get('work_dir') if work_dir is None: msg = 'work_dir is not specified' raise AssertionError(highlighted_error(msg)) cfg.pop('work_dir', None) # remove redundant '/' at the end of work_dir assert work_dir # To pass mypy test while work_dir.endswith('/'): work_dir = work_dir[:-1] config_tmpl, config_suffix = osp.splitext(osp.basename(config)) work_dir_tmpl = work_dir cmds = [] exp_names = [] arg_names = [k for k in search_args_dict] arg_values = [search_args_dict[k] for k in arg_names] for combination in itertools.product(*arg_values): cur_cfg = cp.deepcopy(cfg) # `Config.__init__` does not accept a `Config` object. However, # since mmcv v1.4.6, `deepcopy` a `Config` object also returns a # `Config` object so we need to ensure cur_cfg is not a Config object # before passing to `Config.__init__` for backward compatibility. # See https://github.com/open-mmlab/mmcv/pull/1658 for more details. if not isinstance(cur_cfg, Config): cur_cfg = Config(cur_cfg) suffix_list = [] for k, v in zip(arg_names, combination): suffix_list.extend([k, str(v)]) set_config(cur_cfg, k, v) name_suffix = '_search_' + '_'.join(suffix_list) work_dir = work_dir_tmpl + name_suffix os.makedirs(work_dir, exist_ok=True) config_name = config_tmpl + name_suffix + config_suffix exp_names.append(config_tmpl + name_suffix) config_path = osp.join(work_dir, config_name) with open(config_path, 'w') as fout: fout.write(cur_cfg.pretty_text) other_args_dict_ = cp.deepcopy(other_args_dict) other_args_dict_['work-dir'] = [work_dir] other_args_str = args2string(other_args_dict_) common_args = ['--launcher', launcher] + other_args_str.split() if launcher == 'none': cmd = [PYTHON, train_script, config_path] + common_args help_msg = subprocess.check_output([PYTHON, train_script, '-h']) if '--gpus' in help_msg.decode(): # OpenMMLab 1.0 should add the `--gpus` or `--device` flags. if gpus: cmd += ['--gpus', str(gpus)] else: cmd += ['--device', 'cpu'] elif launcher == 'pytorch': cmd = [ PYTHON, '-m', 'torch.distributed.launch', f'--nproc_per_node={gpus}', f'--master_port={port}', train_script, config_path ] + common_args elif launcher == 'slurm': parsed_srun_args = srun_args.split() if srun_args else [] has_job_name = any([('--job-name' in x) or ('-J' in x) for x in parsed_srun_args]) if not has_job_name: job_name = osp.splitext(osp.basename(config_path))[0] parsed_srun_args.append(f'--job-name={job_name}_train') cmd = [ 'srun', '-p', f'{partition}', f'--gres=gpu:{gpus_per_node}', f'--ntasks={gpus}', f'--ntasks-per-node={gpus_per_node}', f'--cpus-per-task={cpus_per_task}', '--kill-on-bad-exit=1' ] + parsed_srun_args + [PYTHON, '-u', train_script, config_path ] + common_args cmds.append(cmd) time.sleep(5) succeed_list, fail_list = [], [] if launcher in ['none', 'pytorch']: for cmd, exp_name in zip(cmds, exp_names): cmd_text = ' '.join(cmd) click.echo(f'Training command for exp {exp_name} is {cmd_text}. ') ret = subprocess.check_call( cmd, env=dict(os.environ, MASTER_PORT=str(port))) if ret == 0: click.echo(f'Exp {exp_name} finished successfully.') succeed_list.append(exp_name) else: echo_error('Training not finished successfully.') fail_list.append(exp_name) elif launcher == 'slurm': with Executor(max_workers=max_jobs) as executor: for exp, ret in zip(exp_names, executor.map(subprocess.check_call, cmds)): if ret == 0: click.echo(f'Exp {exp} finished successfully.') succeed_list.append(exp) else: echo_error(f'Exp {exp} not finished successfully.') fail_list.append(exp) if len(fail_list): msg = ('The following experiments in hyper parameter search ' f'failed: \n {fail_list}') return False, msg else: msg = ('The hyper parameter search finished successfully.' f'Experiment list: \n {succeed_list}') return True, msg
Read the Docs v: latest
Versions
latest
stable
Downloads
On Read the Docs
Project Home
Builds

Free document hosting provided by Read the Docs.