Ricardo Garcia
Ricardo Garcia

Reputation: 1

Can't configure Gres properly for Slurm

I'm configuring my GPU cluster and I have some problems. This is my slurm.conf file:

sudo cat /etc/slurm-llnl/slurm.conf 
ClusterName=emotions
SlurmctldHost=oceano
#SlurmctldHost=
#
#DisableRootJobs=NO
#EnforcePartLimits=NO
#Epilog=
#EpilogSlurmctld=
#FirstJobId=1
#MaxJobId=67043328
GresTypes=gpu
#GroupUpdateForce=0
#GroupUpdateTime=600
#JobFileAppend=0
#JobRequeue=1
#JobSubmitPlugins=lua
#KillOnBadExit=0
#LaunchType=launch/slurm
#Licenses=foo*4,bar
#MailProg=/bin/mail
#MaxJobCount=10000
#MaxStepCount=40000
#MaxTasksPerNode=512
#MpiDefault=
#MpiParams=ports=#-#
#PluginDir=
#PlugStackConfig=
#PrivateData=jobs
#ProctrackType=proctrack/cgroup
ProctrackType=proctrack/linuxproc
#Prolog=
#PrologFlags=
#PrologSlurmctld=
#PropagatePrioProcess=0
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#RebootProgram=
ReturnToService=1
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=slurm
#SlurmdUser=root
#SrunEpilog=
#SrunProlog=
StateSaveLocation=/var/spool/slurmctld
#SwitchType=
#TaskEpilog=
TaskPlugin=task/affinity,task/cgroup
#TaskProlog=
#TopologyPlugin=topology/tree
#TmpFS=/tmp
#TrackWCKey=no
#TreeWidth=
#UnkillableStepProgram=
#UsePAM=0
#
#
# TIMERS
#BatchStartTimeout=10
#CompleteWait=0
#EpilogMsgTime=2000
#GetEnvTimeout=2
#HealthCheckInterval=0
#HealthCheckProgram=
InactiveLimit=0
KillWait=30
#MessageTimeout=10
#ResvOverRun=0
MinJobAge=300
#OverTimeLimit=0
SlurmctldTimeout=120
SlurmdTimeout=300
#UnkillableStepTimeout=60
#VSizeFactor=0
Waittime=0
#
#
# SCHEDULING
#DefMemPerCPU=0
#MaxMemPerCPU=0
#SchedulerTimeSlice=30
SchedulerType=sched/backfill
SelectType=select/cons_tres
SelectTypeParameters=CR_CPU
#
#
# JOB PRIORITY
#PriorityFlags=
#PriorityType=priority/multifactor
#PriorityDecayHalfLife=
#PriorityCalcPeriod=
#PriorityFavorSmall=
#PriorityMaxAge=
#PriorityUsageResetPeriod=
#PriorityWeightAge=
#PriorityWeightFairshare=
#PriorityWeightJobSize=
#PriorityWeightPartition=
#PriorityWeightQOS=
#
#
# LOGGING AND ACCOUNTING
#AccountingStorageEnforce=0
#AccountingStorageHost=
#AccountingStoragePass=
#AccountingStoragePort=
#AccountingStorageType=
#AccountingStorageUser=
#AccountingStoreFlags=
#JobCompHost=
#JobCompLoc=
#JobCompParams=
#JobCompPass=
#JobCompPort=
JobCompType=jobcomp/none
#JobCompUser=
#JobContainerType=
JobAcctGatherFrequency=30
#JobAcctGatherType=
SlurmctldDebug=info
SlurmctldLogFile=/var/log/slurmctld.log
SlurmdDebug=info
SlurmdLogFile=/var/log/slurmd.log
#SlurmSchedLogFile=
#SlurmSchedLogLevel=
#DebugFlags=
#
#
# POWER SAVE SUPPORT FOR IDLE NODES (optional)
#SuspendProgram=
#ResumeProgram=
#SuspendTimeout=
#ResumeTimeout=
#ResumeRate=
#SuspendExcNodes=
#SuspendExcParts=
#SuspendRate=
#SuspendTime=
#
#
# COMPUTE NODES
NodeName=prometeo                    CPUs=16 RealMemory=64137 Sockets=1 CoresPerSocket=8 ThreadsPerCore=2 Gres=gpu:1 State=UNKNOWN
NodeName=Reineta                     CPUs=16 RealMemory=80281 Sockets=1 CoresPerSocket=8 ThreadsPerCore=2 Gres=gpu:1 State=UNKNOWN
NodeName=crio  NodeAddr=172.17.30.29 CPUs=8  RealMemory=32048 Sockets=1 CoresPerSocket=4 ThreadsPerCore=2 State=UNKNOWN
NodeName=atlas NodeAddr=172.17.30.19 Gres=gpu:1 CPUs=8  RealMemory=32050 Sockets=1 CoresPerSocket=4 ThreadsPerCore=2 State=UNKNOWN


PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP
PartitionName=gpu Nodes=Reineta,atlas Default=NO MaxTime=INFINITE State=UP

This is my gres.conf file:

NodeName=Reineta Name=gpu File=/dev/nvidia0
NodeName=prometeo Name=gpu File=/dev/nvidia0
NodeName=atlas Name=gpu File=/dev/nvidia0

I have run some basic commands in order to use the GPU:

scontrol show node
NodeName=Reineta Arch=x86_64 CoresPerSocket=8 
   CPUAlloc=0 CPUTot=16 CPULoad=7.98
   AvailableFeatures=(null)
   ActiveFeatures=(null)
   Gres=gpu:1
   NodeAddr=Reineta NodeHostName=Reineta Version=19.05.5
   OS=Linux 5.15.0-127-generic #137~20.04.1-Ubuntu SMP Fri Nov 15 14:46:54 UTC 2024 
   RealMemory=80281 AllocMem=0 FreeMem=17212 Sockets=1 Boards=1
   State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   Partitions=debug,gpu 
   BootTime=2024-12-30T15:16:42 SlurmdStartTime=2025-01-06T16:10:57
   CfgTRES=cpu=16,mem=80281M,billing=16
   AllocTRES=
   CapWatts=n/a
   CurrentWatts=0 AveWatts=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
   

NodeName=atlas Arch=x86_64 CoresPerSocket=4 
   CPUAlloc=0 CPUTot=8 CPULoad=1.46
   AvailableFeatures=(null)
   ActiveFeatures=(null)
   Gres=gpu:1
   NodeAddr=172.17.30.19 NodeHostName=atlas Version=19.05.5
   OS=Linux 5.15.0-125-generic #135~20.04.1-Ubuntu SMP Mon Oct 7 13:56:22 UTC 2024 
   RealMemory=32050 AllocMem=0 FreeMem=11249 Sockets=1 Boards=1
   State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   Partitions=debug,gpu 
   BootTime=2024-12-12T10:55:53 SlurmdStartTime=2025-01-06T16:11:20
   CfgTRES=cpu=8,mem=32050M,billing=8
   AllocTRES=
   CapWatts=n/a
   CurrentWatts=0 AveWatts=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
   

NodeName=crio Arch=x86_64 CoresPerSocket=4 
   CPUAlloc=0 CPUTot=8 CPULoad=0.00
   AvailableFeatures=(null)
   ActiveFeatures=(null)
   Gres=(null)
   NodeAddr=172.17.30.29 NodeHostName=crio Version=19.05.5
   OS=Linux 5.15.0-127-generic #137~20.04.1-Ubuntu SMP Fri Nov 15 14:46:54 UTC 2024 
   RealMemory=32048 AllocMem=0 FreeMem=29934 Sockets=1 Boards=1
   State=IDLE+DRAIN ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   Partitions=debug 
   BootTime=2025-01-02T17:05:54 SlurmdStartTime=2025-01-02T17:20:19
   CfgTRES=cpu=8,mem=32048M,billing=8
   AllocTRES=
   CapWatts=n/a
   CurrentWatts=0 AveWatts=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
   Reason=Low RealMemory [root@2025-01-02T17:21:55]

NodeName=prometeo Arch=x86_64 CoresPerSocket=8 
   CPUAlloc=0 CPUTot=16 CPULoad=0.67
   AvailableFeatures=(null)
   ActiveFeatures=(null)
   Gres=gpu:1
   NodeAddr=prometeo NodeHostName=prometeo Version=19.05.5
   OS=Linux 5.15.0-125-generic #135~20.04.1-Ubuntu SMP Mon Oct 7 13:56:22 UTC 2024 
   RealMemory=64137 AllocMem=0 FreeMem=17236 Sockets=1 Boards=1
   State=IDLE+DRAIN ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   Partitions=debug 
   BootTime=2024-12-12T15:04:19 SlurmdStartTime=2025-01-02T17:24:45
   CfgTRES=cpu=16,mem=64137M,billing=16
   AllocTRES=
   CapWatts=n/a
   CurrentWatts=0 AveWatts=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
   Reason=debug [root@2025-01-06T10:08:43]

Here everything seems to be alright, but when I try to run jobs with the GPU, it does not work properly:

This works

srun --nodes=2 --ntasks=2 nvidia-smi
Mon Jan  6 16:29:04 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
Mon Jan  6 16:29:04 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA GeForce RTX 3080 Ti     Off | 00000000:01:00.0  On |                  N/A |
|  0%   55C    P8              40W / 350W |    826MiB / 12288MiB |      1%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                                         
+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|    0   N/A  N/A      2269      G   /usr/lib/xorg/Xorg                           51MiB |
|    0   N/A  N/A      3009      G   /usr/lib/xorg/Xorg                          230MiB |
|    0   N/A  N/A      3138      G   /usr/bin/gnome-shell                         93MiB |
|    0   N/A  N/A   1165905      G   ...erProcess --variations-seed-version       29MiB |
|    0   N/A  N/A   2457722      C   python3                                     390MiB |
+---------------------------------------------------------------------------------------+
|   0  NVIDIA GeForce GTX 1080        Off | 00000000:01:00.0 Off |                  N/A |
|  0%   48C    P8              11W / 240W |     30MiB /  8192MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                                         
+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|    0   N/A  N/A      1028      G   /usr/lib/xorg/Xorg                           18MiB |
|    0   N/A  N/A      1100      G   /usr/bin/gnome-shell                          7MiB |
+---------------------------------------------------------------------------------------+

But this does not work:

srun --partition=gpu --nodes=2 --gres=gpu:1 nvidia-smi
srun: error: Unable to allocate resources: Requested node configuration is not available

Thanks in advance!

I have tried to configure gres.conf, but it didn't work. I need to get my GPU cluster running.

Upvotes: 0

Views: 65

Answers (0)

Related Questions