Reputation: 402
I am setting up Dask using the instructions here, creating a config.yaml to set my variables etc. I am currently trying to set the lifetime of my workers to 1 hour, with restarts, as I am having a memory leak. Looking at the values.yaml, https://github.com/dask/helm-chart/blob/master/dask/values.yaml, I have tried to set it up as below:
worker:
  replicas: 8
  extraArgs:
    - "--lifetime 1hr --lifetime-restart --lifetime-stagger 5m"
However, when I check my config, the lifetime still doesn't appear to be set:
import dask
import dask.distributed
from distributed import Client
client = Client()
dask.config.config
The output:
{'jupyter_port_80_tcp': 'tcp://10.0.72.11:80',
'jupyter_port': 'tcp://10.0.72.11:80',
'scheduler_port_80_tcp_proto': 'tcp',
'jupyter_service_port': 80,
'jupyter_service_host': '10.0.72.11',
'jupyter_port_80_tcp_port': 80,
'scheduler_port_80_tcp_addr': '10.0.250.48',
'scheduler_address': 'dask-scheduler:8786',
'scheduler_service_port_dask_webui': 80,
'scheduler_port_8786_tcp_port': 8786,
'scheduler_service_port': 8786,
'scheduler_port_80_tcp': 'tcp://10.0.250.48:80',
'jupyter_service_port_dask_jupyter': 80,
'scheduler_port_8786_tcp': 'tcp://10.0.250.48:8786',
'scheduler_port_80_tcp_port': 80,
'jupyter_port_80_tcp_addr': '10.0.72.11',
'scheduler_port': 'tcp://10.0.250.48:8786',
'jupyter_port_80_tcp_proto': 'tcp',
'scheduler_service_port_dask_scheduler': 8786,
'scheduler_service_host': '10.0.250.48',
'scheduler_port_8786_tcp_proto': 'tcp',
'scheduler_port_8786_tcp_addr': '10.0.250.48',
'temporary-directory': None,
'dataframe': {'shuffle-compression': None},
'array': {'svg': {'size': 120}, 'slicing': {'split-large-chunks': None}},
'optimization': {'fuse': {'active': True,
'ave-width': 1,
'max-width': None,
'max-height': inf,
'max-depth-new-edges': None,
'subgraphs': None,
'rename-keys': True}},
'distributed': {'version': 2,
'scheduler': {'allowed-failures': 3,
'bandwidth': 100000000,
'blocked-handlers': [],
'default-data-size': '1kiB',
'events-cleanup-delay': '1h',
'idle-timeout': None,
'transition-log-length': 100000,
'work-stealing': True,
'work-stealing-interval': '100ms',
'worker-ttl': None,
'pickle': True,
'preload': [],
'preload-argv': [],
'unknown-task-duration': '500ms',
'default-task-durations': {'rechunk-split': '1us', 'shuffle-split': '1us'},
'validate': False,
'dashboard': {'status': {'task-stream-length': 1000},
'tasks': {'task-stream-length': 100000},
'tls': {'ca-file': None, 'key': None, 'cert': None},
'bokeh-application': {'allow_websocket_origin': ['*'],
'keep_alive_milliseconds': 500,
'check_unused_sessions_milliseconds': 500}},
'locks': {'lease-validation-interval': '10s', 'lease-timeout': '30s'},
'http': {'routes': ['distributed.http.scheduler.prometheus',
'distributed.http.scheduler.info',
'distributed.http.scheduler.json',
'distributed.http.health',
'distributed.http.proxy',
'distributed.http.statics']}},
'worker': {'blocked-handlers': [],
'multiprocessing-method': 'spawn',
'use-file-locking': True,
'connections': {'outgoing': 50, 'incoming': 10},
'preload': [],
'preload-argv': [],
'daemon': True,
'validate': False,
'lifetime': {'duration': None, 'stagger': '0 seconds', 'restart': False},
'profile': {'interval': '10ms', 'cycle': '1000ms', 'low-level': False},
'memory': {'target': 0.6, 'spill': 0.7, 'pause': 0.8, 'terminate': 0.95},
'http': {'routes': ['distributed.http.worker.prometheus',
'distributed.http.health',
'distributed.http.statics']}},
'nanny': {'preload': [], 'preload-argv': []},
'client': {'heartbeat': '5s', 'scheduler-info-interval': '2s'},
'deploy': {'lost-worker-timeout': '15s', 'cluster-repr-interval': '500ms'},
'adaptive': {'interval': '1s',
'target-duration': '5s',
'minimum': 0,
'maximum': inf,
'wait-count': 3},
'comm': {'retry': {'count': 0, 'delay': {'min': '1s', 'max': '20s'}},
'compression': 'auto',
'offload': '10MiB',
'default-scheme': 'tcp',
'socket-backlog': 2048,
'recent-messages-log-length': 0,
'zstd': {'level': 3, 'threads': 0},
'timeouts': {'connect': '10s', 'tcp': '30s'},
'require-encryption': None,
'tls': {'ciphers': None,
'ca-file': None,
'scheduler': {'cert': None, 'key': None},
'worker': {'key': None, 'cert': None},
'client': {'key': None, 'cert': None}}},
'dashboard': {'link': '{scheme}://{host}:{port}/status',
'export-tool': False,
'graph-max-items': 5000},
'admin': {'tick': {'interval': '20ms', 'limit': '3s'},
'max-error-length': 10000,
'log-length': 10000,
'log-format': '%(name)s - %(levelname)s - %(message)s',
'pdb-on-err': False}},
'rmm': {'pool-size': None},
'ucx': {'tcp': None,
'nvlink': None,
'infiniband': None,
'rdmacm': None,
'cuda_copy': None,
'net-devices': None,
'reuse-endpoints': True},
'scheduler': 'dask.distributed',
'shuffle': 'tasks'}
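(As a sanity check, I believe the running workers can also be queried directly, since dask.config.config on the client may only show the client-side defaults; that the Worker object exposes a lifetime attribute is my assumption:)
from distributed import Client

client = Client()  # connects to the scheduler address set in the environment

# client.run executes the function on every worker; a parameter named
# dask_worker is populated with the worker object itself
print(client.run(lambda dask_worker: dask_worker.lifetime))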
In addition, when I try locally, it also doesn't appear to work:
import dask
import dask.distributed
from distributed import Client
client = Client(n_workers=8, lifetime="1 hour", lifetime_restart=True)
dask.config.config
The output is identical to the config dump above (minus the Kubernetes service entries); in particular, the worker lifetime section is unchanged:
'lifetime': {'duration': None, 'stagger': '0 seconds', 'restart': False},
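I would have expected something like the following to at least make the setting show up in dask.config.config (the key names are taken from the dump above; whether this also has to happen before the workers start is my assumption):
import dask

# set the worker lifetime through dask's config system
dask.config.set({
    "distributed.worker.lifetime.duration": "1 hour",
    "distributed.worker.lifetime.restart": True,
    "distributed.worker.lifetime.stagger": "5 minutes",
})
print(dask.config.get("distributed.worker.lifetime"))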
Any advice?
Upvotes: 1
Views: 1188
Reputation: 26
In the class definition, the lifetime_restart argument has to be set to a bool, so I would think this would be:
--lifetime_restart True
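For the keyword form, a minimal sketch (assuming the lifetime keywords are forwarded to the workers, as in the question's local example):
from dask.distributed import Client

client = Client(
    n_workers=8,
    lifetime="1 hour",
    lifetime_restart=True,    # a bool, as noted above
    lifetime_stagger="5 minutes",
)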
Upvotes: 1