AmyChodorowski

Reputation: 402

Setting dask worker's lifetime parameter

I am setting up dask using the instructions here, creating a config.yaml to set my variables, etc. I am currently trying to set the lifetime of my workers to 1 hour, with restarts, as I am having a memory leak. Looking at the values.yaml (https://github.com/dask/helm-chart/blob/master/dask/values.yaml), I have tried to set it up as below:

worker:
  replicas: 8
  extraArgs:
    - "--lifetime 1hr --lifetime-restart --lifetime-stagger 5m"

However, when I check my config, lifetime still doesn't appear to be set:

import dask
import dask.distributed
from distributed import Client
client = Client()

dask.config.config

The output:

{'jupyter_port_80_tcp': 'tcp://10.0.72.11:80',
 'jupyter_port': 'tcp://10.0.72.11:80',
 'scheduler_port_80_tcp_proto': 'tcp',
 'jupyter_service_port': 80,
 'jupyter_service_host': '10.0.72.11',
 'jupyter_port_80_tcp_port': 80,
 'scheduler_port_80_tcp_addr': '10.0.250.48',
 'scheduler_address': 'dask-scheduler:8786',
 'scheduler_service_port_dask_webui': 80,
 'scheduler_port_8786_tcp_port': 8786,
 'scheduler_service_port': 8786,
 'scheduler_port_80_tcp': 'tcp://10.0.250.48:80',
 'jupyter_service_port_dask_jupyter': 80,
 'scheduler_port_8786_tcp': 'tcp://10.0.250.48:8786',
 'scheduler_port_80_tcp_port': 80,
 'jupyter_port_80_tcp_addr': '10.0.72.11',
 'scheduler_port': 'tcp://10.0.250.48:8786',
 'jupyter_port_80_tcp_proto': 'tcp',
 'scheduler_service_port_dask_scheduler': 8786,
 'scheduler_service_host': '10.0.250.48',
 'scheduler_port_8786_tcp_proto': 'tcp',
 'scheduler_port_8786_tcp_addr': '10.0.250.48',
 'temporary-directory': None,
 'dataframe': {'shuffle-compression': None},
 'array': {'svg': {'size': 120}, 'slicing': {'split-large-chunks': None}},
 'optimization': {'fuse': {'active': True,
   'ave-width': 1,
   'max-width': None,
   'max-height': inf,
   'max-depth-new-edges': None,
   'subgraphs': None,
   'rename-keys': True}},
 'distributed': {'version': 2,
  'scheduler': {'allowed-failures': 3,
   'bandwidth': 100000000,
   'blocked-handlers': [],
   'default-data-size': '1kiB',
   'events-cleanup-delay': '1h',
   'idle-timeout': None,
   'transition-log-length': 100000,
   'work-stealing': True,
   'work-stealing-interval': '100ms',
   'worker-ttl': None,
   'pickle': True,
   'preload': [],
   'preload-argv': [],
   'unknown-task-duration': '500ms',
   'default-task-durations': {'rechunk-split': '1us', 'shuffle-split': '1us'},
   'validate': False,
   'dashboard': {'status': {'task-stream-length': 1000},
    'tasks': {'task-stream-length': 100000},
    'tls': {'ca-file': None, 'key': None, 'cert': None},
    'bokeh-application': {'allow_websocket_origin': ['*'],
     'keep_alive_milliseconds': 500,
     'check_unused_sessions_milliseconds': 500}},
   'locks': {'lease-validation-interval': '10s', 'lease-timeout': '30s'},
   'http': {'routes': ['distributed.http.scheduler.prometheus',
     'distributed.http.scheduler.info',
     'distributed.http.scheduler.json',
     'distributed.http.health',
     'distributed.http.proxy',
     'distributed.http.statics']}},
  'worker': {'blocked-handlers': [],
   'multiprocessing-method': 'spawn',
   'use-file-locking': True,
   'connections': {'outgoing': 50, 'incoming': 10},
   'preload': [],
   'preload-argv': [],
   'daemon': True,
   'validate': False,
   'lifetime': {'duration': None, 'stagger': '0 seconds', 'restart': False},
   'profile': {'interval': '10ms', 'cycle': '1000ms', 'low-level': False},
   'memory': {'target': 0.6, 'spill': 0.7, 'pause': 0.8, 'terminate': 0.95},
   'http': {'routes': ['distributed.http.worker.prometheus',
     'distributed.http.health',
     'distributed.http.statics']}},
  'nanny': {'preload': [], 'preload-argv': []},
  'client': {'heartbeat': '5s', 'scheduler-info-interval': '2s'},
  'deploy': {'lost-worker-timeout': '15s', 'cluster-repr-interval': '500ms'},
  'adaptive': {'interval': '1s',
   'target-duration': '5s',
   'minimum': 0,
   'maximum': inf,
   'wait-count': 3},
  'comm': {'retry': {'count': 0, 'delay': {'min': '1s', 'max': '20s'}},
   'compression': 'auto',
   'offload': '10MiB',
   'default-scheme': 'tcp',
   'socket-backlog': 2048,
   'recent-messages-log-length': 0,
   'zstd': {'level': 3, 'threads': 0},
   'timeouts': {'connect': '10s', 'tcp': '30s'},
   'require-encryption': None,
   'tls': {'ciphers': None,
    'ca-file': None,
    'scheduler': {'cert': None, 'key': None},
    'worker': {'key': None, 'cert': None},
    'client': {'key': None, 'cert': None}}},
  'dashboard': {'link': '{scheme}://{host}:{port}/status',
   'export-tool': False,
   'graph-max-items': 5000},
  'admin': {'tick': {'interval': '20ms', 'limit': '3s'},
   'max-error-length': 10000,
   'log-length': 10000,
   'log-format': '%(name)s - %(levelname)s - %(message)s',
   'pdb-on-err': False}},
 'rmm': {'pool-size': None},
 'ucx': {'tcp': None,
  'nvlink': None,
  'infiniband': None,
  'rdmacm': None,
  'cuda_copy': None,
  'net-devices': None,
  'reuse-endpoints': True},
 'scheduler': 'dask.distributed',
 'shuffle': 'tasks'}

In addition, when I try locally, it also doesn't appear to work:

import dask
import dask.distributed
from distributed import Client
client = Client(n_workers=8, lifetime="1 hour", lifetime_restart=True)

dask.config.config

The output:

{'temporary-directory': None,
 'dataframe': {'shuffle-compression': None},
 'array': {'svg': {'size': 120}, 'slicing': {'split-large-chunks': None}},
 'optimization': {'fuse': {'active': True,
   'ave-width': 1,
   'max-width': None,
   'max-height': inf,
   'max-depth-new-edges': None,
   'subgraphs': None,
   'rename-keys': True}},
 'distributed': {'version': 2,
  'scheduler': {'allowed-failures': 3,
   'bandwidth': 100000000,
   'blocked-handlers': [],
   'default-data-size': '1kiB',
   'events-cleanup-delay': '1h',
   'idle-timeout': None,
   'transition-log-length': 100000,
   'work-stealing': True,
   'work-stealing-interval': '100ms',
   'worker-ttl': None,
   'pickle': True,
   'preload': [],
   'preload-argv': [],
   'unknown-task-duration': '500ms',
   'default-task-durations': {'rechunk-split': '1us', 'shuffle-split': '1us'},
   'validate': False,
   'dashboard': {'status': {'task-stream-length': 1000},
    'tasks': {'task-stream-length': 100000},
    'tls': {'ca-file': None, 'key': None, 'cert': None},
    'bokeh-application': {'allow_websocket_origin': ['*'],
     'keep_alive_milliseconds': 500,
     'check_unused_sessions_milliseconds': 500}},
   'locks': {'lease-validation-interval': '10s', 'lease-timeout': '30s'},
   'http': {'routes': ['distributed.http.scheduler.prometheus',
     'distributed.http.scheduler.info',
     'distributed.http.scheduler.json',
     'distributed.http.health',
     'distributed.http.proxy',
     'distributed.http.statics']}},
  'worker': {'blocked-handlers': [],
   'multiprocessing-method': 'spawn',
   'use-file-locking': True,
   'connections': {'outgoing': 50, 'incoming': 10},
   'preload': [],
   'preload-argv': [],
   'daemon': True,
   'validate': False,
   'lifetime': {'duration': None, 'stagger': '0 seconds', 'restart': False},
   'profile': {'interval': '10ms', 'cycle': '1000ms', 'low-level': False},
   'memory': {'target': 0.6, 'spill': 0.7, 'pause': 0.8, 'terminate': 0.95},
   'http': {'routes': ['distributed.http.worker.prometheus',
     'distributed.http.health',
     'distributed.http.statics']}},
  'nanny': {'preload': [], 'preload-argv': []},
  'client': {'heartbeat': '5s', 'scheduler-info-interval': '2s'},
  'deploy': {'lost-worker-timeout': '15s', 'cluster-repr-interval': '500ms'},
  'adaptive': {'interval': '1s',
   'target-duration': '5s',
   'minimum': 0,
   'maximum': inf,
   'wait-count': 3},
  'comm': {'retry': {'count': 0, 'delay': {'min': '1s', 'max': '20s'}},
   'compression': 'auto',
   'offload': '10MiB',
   'default-scheme': 'tcp',
   'socket-backlog': 2048,
   'recent-messages-log-length': 0,
   'zstd': {'level': 3, 'threads': 0},
   'timeouts': {'connect': '10s', 'tcp': '30s'},
   'require-encryption': None,
   'tls': {'ciphers': None,
    'ca-file': None,
    'scheduler': {'cert': None, 'key': None},
    'worker': {'key': None, 'cert': None},
    'client': {'key': None, 'cert': None}}},
  'dashboard': {'link': '{scheme}://{host}:{port}/status',
   'export-tool': False,
   'graph-max-items': 5000},
  'admin': {'tick': {'interval': '20ms', 'limit': '3s'},
   'max-error-length': 10000,
   'log-length': 10000,
   'log-format': '%(name)s - %(levelname)s - %(message)s',
   'pdb-on-err': False}},
 'rmm': {'pool-size': None},
 'ucx': {'tcp': None,
  'nvlink': None,
  'infiniband': None,
  'rdmacm': None,
  'cuda_copy': None,
  'net-devices': None,
  'reuse-endpoints': True},
 'scheduler': 'dask.distributed',
 'shuffle': 'tasks'}
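
I suspect dask.config.config may only reflect config files and environment variables, not keyword arguments passed to Client, so maybe I am checking in the wrong place. Is there a way to inspect what the workers themselves actually received? I was thinking of something like this (assuming the Worker object stores the parsed value on a lifetime attribute):

from dask.distributed import Client

client = Client(n_workers=8, lifetime="1 hour", lifetime_restart=True)

# client.run executes a function on every worker; a function that takes a
# `dask_worker` keyword argument receives the Worker instance itself.
print(client.run(lambda dask_worker: dask_worker.lifetime))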

Any advice?

Upvotes: 1

Views: 1188

Answers (1)

Paul H

Reputation: 26

In the class definition, the lifetime_restart argument has to be set to a bool, so I would think this would be: --lifetime-restart True
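
Also, if I understand the helm chart correctly, each entry in extraArgs is passed through as its own command-line token, so the flags may need to be split into separate list entries rather than combined in one string. A rough sketch (untested):

worker:
  replicas: 8
  extraArgs:
    - "--lifetime"
    - "1 hour"
    - "--lifetime-restart"
    - "--lifetime-stagger"
    - "5 minutes"

If dask-worker treats --lifetime-restart as a presence-only flag, no value is needed after it; otherwise, add "True" as its own entry.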

Upvotes: 1
