Reputation: 1
I'm configuring my GPU cluster and I have some problems. This is my slurm.conf file:
sudo cat /etc/slurm-llnl/slurm.conf
ClusterName=emotions
SlurmctldHost=oceano
#SlurmctldHost=
#
#DisableRootJobs=NO
#EnforcePartLimits=NO
#Epilog=
#EpilogSlurmctld=
#FirstJobId=1
#MaxJobId=67043328
GresTypes=gpu
#GroupUpdateForce=0
#GroupUpdateTime=600
#JobFileAppend=0
#JobRequeue=1
#JobSubmitPlugins=lua
#KillOnBadExit=0
#LaunchType=launch/slurm
#Licenses=foo*4,bar
#MailProg=/bin/mail
#MaxJobCount=10000
#MaxStepCount=40000
#MaxTasksPerNode=512
#MpiDefault=
#MpiParams=ports=#-#
#PluginDir=
#PlugStackConfig=
#PrivateData=jobs
#ProctrackType=proctrack/cgroup
ProctrackType=proctrack/linuxproc
#Prolog=
#PrologFlags=
#PrologSlurmctld=
#PropagatePrioProcess=0
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#RebootProgram=
ReturnToService=1
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=slurm
#SlurmdUser=root
#SrunEpilog=
#SrunProlog=
StateSaveLocation=/var/spool/slurmctld
#SwitchType=
#TaskEpilog=
TaskPlugin=task/affinity,task/cgroup
#TaskProlog=
#TopologyPlugin=topology/tree
#TmpFS=/tmp
#TrackWCKey=no
#TreeWidth=
#UnkillableStepProgram=
#UsePAM=0
#
#
# TIMERS
#BatchStartTimeout=10
#CompleteWait=0
#EpilogMsgTime=2000
#GetEnvTimeout=2
#HealthCheckInterval=0
#HealthCheckProgram=
InactiveLimit=0
KillWait=30
#MessageTimeout=10
#ResvOverRun=0
MinJobAge=300
#OverTimeLimit=0
SlurmctldTimeout=120
SlurmdTimeout=300
#UnkillableStepTimeout=60
#VSizeFactor=0
Waittime=0
#
#
# SCHEDULING
#DefMemPerCPU=0
#MaxMemPerCPU=0
#SchedulerTimeSlice=30
SchedulerType=sched/backfill
SelectType=select/cons_tres
SelectTypeParameters=CR_CPU
#
#
# JOB PRIORITY
#PriorityFlags=
#PriorityType=priority/multifactor
#PriorityDecayHalfLife=
#PriorityCalcPeriod=
#PriorityFavorSmall=
#PriorityMaxAge=
#PriorityUsageResetPeriod=
#PriorityWeightAge=
#PriorityWeightFairshare=
#PriorityWeightJobSize=
#PriorityWeightPartition=
#PriorityWeightQOS=
#
#
# LOGGING AND ACCOUNTING
#AccountingStorageEnforce=0
#AccountingStorageHost=
#AccountingStoragePass=
#AccountingStoragePort=
#AccountingStorageType=
#AccountingStorageUser=
#AccountingStoreFlags=
#JobCompHost=
#JobCompLoc=
#JobCompParams=
#JobCompPass=
#JobCompPort=
JobCompType=jobcomp/none
#JobCompUser=
#JobContainerType=
JobAcctGatherFrequency=30
#JobAcctGatherType=
SlurmctldDebug=info
SlurmctldLogFile=/var/log/slurmctld.log
SlurmdDebug=info
SlurmdLogFile=/var/log/slurmd.log
#SlurmSchedLogFile=
#SlurmSchedLogLevel=
#DebugFlags=
#
#
# POWER SAVE SUPPORT FOR IDLE NODES (optional)
#SuspendProgram=
#ResumeProgram=
#SuspendTimeout=
#ResumeTimeout=
#ResumeRate=
#SuspendExcNodes=
#SuspendExcParts=
#SuspendRate=
#SuspendTime=
#
#
# COMPUTE NODES
NodeName=prometeo CPUs=16 RealMemory=64137 Sockets=1 CoresPerSocket=8 ThreadsPerCore=2 Gres=gpu:1 State=UNKNOWN
NodeName=Reineta CPUs=16 RealMemory=80281 Sockets=1 CoresPerSocket=8 ThreadsPerCore=2 Gres=gpu:1 State=UNKNOWN
NodeName=crio NodeAddr=172.17.30.29 CPUs=8 RealMemory=32048 Sockets=1 CoresPerSocket=4 ThreadsPerCore=2 State=UNKNOWN
NodeName=atlas NodeAddr=172.17.30.19 Gres=gpu:1 CPUs=8 RealMemory=32050 Sockets=1 CoresPerSocket=4 ThreadsPerCore=2 State=UNKNOWN
PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP
PartitionName=gpu Nodes=Reineta,atlas Default=NO MaxTime=INFINITE State=UP
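If it helps diagnose this, I can also post the per-node GRES and partition view; as far as I understand the sinfo format options, a query like the one below should list exactly that (the format string is just my attempt at something readable):

sinfo -N -o "%N %P %G %t"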
This is my gres.conf file:
NodeName=Reineta Name=gpu File=/dev/nvidia0
NodeName=prometeo Name=gpu File=/dev/nvidia0
NodeName=atlas Name=gpu File=/dev/nvidia0
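After editing gres.conf I reload the configuration more or less like this (the service names come from the Ubuntu slurm-llnl packages, so they may differ on other setups):

sudo systemctl restart slurmctld   # on the controller (oceano)
sudo systemctl restart slurmd      # on every compute node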
I have run some basic commands in order to use the GPU:
scontrol show node
NodeName=Reineta Arch=x86_64 CoresPerSocket=8
CPUAlloc=0 CPUTot=16 CPULoad=7.98
AvailableFeatures=(null)
ActiveFeatures=(null)
Gres=gpu:1
NodeAddr=Reineta NodeHostName=Reineta Version=19.05.5
OS=Linux 5.15.0-127-generic #137~20.04.1-Ubuntu SMP Fri Nov 15 14:46:54 UTC 2024
RealMemory=80281 AllocMem=0 FreeMem=17212 Sockets=1 Boards=1
State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=debug,gpu
BootTime=2024-12-30T15:16:42 SlurmdStartTime=2025-01-06T16:10:57
CfgTRES=cpu=16,mem=80281M,billing=16
AllocTRES=
CapWatts=n/a
CurrentWatts=0 AveWatts=0
ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
NodeName=atlas Arch=x86_64 CoresPerSocket=4
CPUAlloc=0 CPUTot=8 CPULoad=1.46
AvailableFeatures=(null)
ActiveFeatures=(null)
Gres=gpu:1
NodeAddr=172.17.30.19 NodeHostName=atlas Version=19.05.5
OS=Linux 5.15.0-125-generic #135~20.04.1-Ubuntu SMP Mon Oct 7 13:56:22 UTC 2024
RealMemory=32050 AllocMem=0 FreeMem=11249 Sockets=1 Boards=1
State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=debug,gpu
BootTime=2024-12-12T10:55:53 SlurmdStartTime=2025-01-06T16:11:20
CfgTRES=cpu=8,mem=32050M,billing=8
AllocTRES=
CapWatts=n/a
CurrentWatts=0 AveWatts=0
ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
NodeName=crio Arch=x86_64 CoresPerSocket=4
CPUAlloc=0 CPUTot=8 CPULoad=0.00
AvailableFeatures=(null)
ActiveFeatures=(null)
Gres=(null)
NodeAddr=172.17.30.29 NodeHostName=crio Version=19.05.5
OS=Linux 5.15.0-127-generic #137~20.04.1-Ubuntu SMP Fri Nov 15 14:46:54 UTC 2024
RealMemory=32048 AllocMem=0 FreeMem=29934 Sockets=1 Boards=1
State=IDLE+DRAIN ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=debug
BootTime=2025-01-02T17:05:54 SlurmdStartTime=2025-01-02T17:20:19
CfgTRES=cpu=8,mem=32048M,billing=8
AllocTRES=
CapWatts=n/a
CurrentWatts=0 AveWatts=0
ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
Reason=Low RealMemory [root@2025-01-02T17:21:55]
NodeName=prometeo Arch=x86_64 CoresPerSocket=8
CPUAlloc=0 CPUTot=16 CPULoad=0.67
AvailableFeatures=(null)
ActiveFeatures=(null)
Gres=gpu:1
NodeAddr=prometeo NodeHostName=prometeo Version=19.05.5
OS=Linux 5.15.0-125-generic #135~20.04.1-Ubuntu SMP Mon Oct 7 13:56:22 UTC 2024
RealMemory=64137 AllocMem=0 FreeMem=17236 Sockets=1 Boards=1
State=IDLE+DRAIN ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=debug
BootTime=2024-12-12T15:04:19 SlurmdStartTime=2025-01-02T17:24:45
CfgTRES=cpu=16,mem=64137M,billing=16
AllocTRES=
CapWatts=n/a
CurrentWatts=0 AveWatts=0
ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
Reason=debug [root@2025-01-06T10:08:43]
Here everything seems to be all right, but when I try to run jobs with a GPU, it does not work properly.
This works:
srun --nodes=2 --ntasks=2 nvidia-smi
Mon Jan 6 16:29:04 2025
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01 Driver Version: 535.183.01 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
Mon Jan 6 16:29:04 2025
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01 Driver Version: 535.183.01 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 3080 Ti Off | 00000000:01:00.0 On | N/A |
| 0% 55C P8 40W / 350W | 826MiB / 12288MiB | 1% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| 0 N/A N/A 2269 G /usr/lib/xorg/Xorg 51MiB |
| 0 N/A N/A 3009 G /usr/lib/xorg/Xorg 230MiB |
| 0 N/A N/A 3138 G /usr/bin/gnome-shell 93MiB |
| 0 N/A N/A 1165905 G ...erProcess --variations-seed-version 29MiB |
| 0 N/A N/A 2457722 C python3 390MiB |
+---------------------------------------------------------------------------------------+
| 0 NVIDIA GeForce GTX 1080 Off | 00000000:01:00.0 Off | N/A |
| 0% 48C P8 11W / 240W | 30MiB / 8192MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| 0 N/A N/A 1028 G /usr/lib/xorg/Xorg 18MiB |
| 0 N/A N/A 1100 G /usr/bin/gnome-shell 7MiB |
+---------------------------------------------------------------------------------------+
But this does not work:
srun --partition=gpu --nodes=2 --gres=gpu:1 nvidia-smi
srun: error: Unable to allocate resources: Requested node configuration is not available
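To narrow it down, the single-node variant below is what I would try next; I can add its output if that is useful:

srun --partition=gpu --nodes=1 --ntasks=1 --gres=gpu:1 nvidia-smi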
Thanks in advance!
I have tried to configure gres.conf, but it didn't work. I need to get my GPU cluster running.
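In case the intended workload matters, the kind of batch job I ultimately want to run looks roughly like this (train.py is just a placeholder script name):

#!/bin/bash
#SBATCH --job-name=gpu-test
#SBATCH --partition=gpu
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --gres=gpu:1
nvidia-smi
python3 train.py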
Upvotes: 0
Views: 65