diff --git a/mmdet/apis/env.py b/mmdet/apis/env.py
index 20cd26dee8fbc258ffd4c50fef6e8468bf4ba094..19b0f86db136b93cf118f45623bed78c1e00791a 100644
--- a/mmdet/apis/env.py
+++ b/mmdet/apis/env.py
@@ -1,6 +1,7 @@
 import logging
 import os
 import random
+import subprocess
 
 import numpy as np
 import torch
@@ -34,8 +35,45 @@ def _init_dist_mpi(backend, **kwargs):
     raise NotImplementedError
 
 
-def _init_dist_slurm(backend, **kwargs):
-    raise NotImplementedError
+def _init_dist_slurm(backend, port=29500, **kwargs):
+    """Initialize the process group from SLURM environment variables.
+
+    The rank and world size are read from ``SLURM_PROCID`` and
+    ``SLURM_NTASKS``, and the first host of ``SLURM_NODELIST`` is exported
+    as the master address.
+
+    Args:
+        backend (str): Backend passed to ``dist.init_process_group``.
+        port (int): Port exported as ``MASTER_PORT``. Defaults to 29500.
+        **kwargs: Accepted to match the signature of ``_init_dist_mpi``;
+            currently unused.
+
+    Raises:
+        KeyError: If a required ``SLURM_*`` variable is not set.
+        subprocess.CalledProcessError: If ``scontrol`` exits non-zero.
+        RuntimeError: If ``scontrol`` reports no hostname.
+    """
+    proc_id = int(os.environ['SLURM_PROCID'])
+    ntasks = int(os.environ['SLURM_NTASKS'])
+    node_list = os.environ['SLURM_NODELIST']
+    num_gpus = torch.cuda.device_count()
+    # Device is the rank modulo the visible GPU count (assumes one task/GPU).
+    torch.cuda.set_device(proc_id % num_gpus)
+    # Use an argument list rather than a shell string so the bracket
+    # syntax of a node list (e.g. "node[01-04]") is never glob-expanded,
+    # and fail loudly if scontrol itself fails.
+    hostnames = subprocess.check_output(
+        ['scontrol', 'show', 'hostname', node_list],
+        universal_newlines=True).split()
+    if not hostnames:
+        raise RuntimeError(
+            'scontrol returned no hostname for {}'.format(node_list))
+    addr = hostnames[0]
+    os.environ['MASTER_PORT'] = str(port)
+    os.environ['MASTER_ADDR'] = addr
+    os.environ['WORLD_SIZE'] = str(ntasks)
+    os.environ['RANK'] = str(proc_id)
+    dist.init_process_group(backend=backend)
 
 
 def set_random_seed(seed):
diff --git a/tools/slurm_test.sh b/tools/slurm_test.sh
new file mode 100755
index 0000000000000000000000000000000000000000..be2ab9d6a5befcc673023989dfc32578b5612a04
--- /dev/null
+++ b/tools/slurm_test.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+# Launch tools/test.py on a SLURM partition through srun.
+#
+# Usage: tools/slurm_test.sh PARTITION JOB_NAME CONFIG CHECKPOINT [PY_ARGS...]
+# Environment overrides: GPUS (8), CPUS_PER_TASK (32), SRUN_ARGS ("").
+
+set -x
+
+if [ "$#" -lt 4 ]; then
+    echo "Usage: $0 PARTITION JOB_NAME CONFIG CHECKPOINT [PY_ARGS...]" >&2
+    exit 1
+fi
+
+PARTITION=$1
+JOB_NAME=$2
+CONFIG=$3
+CHECKPOINT=$4
+GPUS=${GPUS:-8}
+CPUS_PER_TASK=${CPUS_PER_TASK:-32}
+# Keep extra arguments as an array so each one survives as a single word.
+PY_ARGS=("${@:5}")
+SRUN_ARGS=${SRUN_ARGS:-""}
+
+# SRUN_ARGS is left unquoted on purpose: it holds several space-separated
+# srun options that must be split into separate words.
+srun -p "${PARTITION}" \
+    --job-name="${JOB_NAME}" \
+    --gres=gpu:"${GPUS}" \
+    --ntasks=1 \
+    --ntasks-per-node=1 \
+    --cpus-per-task="${CPUS_PER_TASK}" \
+    --kill-on-bad-exit=1 \
+    ${SRUN_ARGS} \
+    python tools/test.py "${CONFIG}" "${CHECKPOINT}" --gpus "${GPUS}" "${PY_ARGS[@]}"
diff --git a/tools/slurm_train.sh b/tools/slurm_train.sh
new file mode 100755
index 0000000000000000000000000000000000000000..45474c46aa4e42c9061342e74f01b22ece9323f9
--- /dev/null
+++ b/tools/slurm_train.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+# Launch tools/train.py on a SLURM partition through srun, starting one
+# task per requested GPU.
+#
+# Usage: tools/slurm_train.sh PARTITION JOB_NAME CONFIG WORK_DIR [GPUS]
+# Environment overrides: GPUS_PER_NODE (8), CPUS_PER_TASK (5), SRUN_ARGS
+# (""), PY_ARGS ("--validate").
+
+set -x
+
+if [ "$#" -lt 4 ]; then
+    echo "Usage: $0 PARTITION JOB_NAME CONFIG WORK_DIR [GPUS]" >&2
+    exit 1
+fi
+
+PARTITION=$1
+JOB_NAME=$2
+CONFIG=$3
+WORK_DIR=$4
+GPUS=${5:-8}
+GPUS_PER_NODE=${GPUS_PER_NODE:-8}
+CPUS_PER_TASK=${CPUS_PER_TASK:-5}
+SRUN_ARGS=${SRUN_ARGS:-""}
+PY_ARGS=${PY_ARGS:-"--validate"}
+
+# SRUN_ARGS and PY_ARGS are left unquoted on purpose: each holds several
+# space-separated options that must be split into separate words.
+srun -p "${PARTITION}" \
+    --job-name="${JOB_NAME}" \
+    --gres=gpu:"${GPUS_PER_NODE}" \
+    --ntasks="${GPUS}" \
+    --ntasks-per-node="${GPUS_PER_NODE}" \
+    --cpus-per-task="${CPUS_PER_TASK}" \
+    --kill-on-bad-exit=1 \
+    ${SRUN_ARGS} \
+    python -u tools/train.py "${CONFIG}" --work_dir="${WORK_DIR}" --launcher="slurm" ${PY_ARGS}