Alita Brown
Alita Brown

Reputation: 11

td-agent ruby process has high cpu consumption

The td-agent ruby process is running at 100% CPU. The agent works and outputs the expected results, but the system resources it consumes are slowing the machine down.

Restarting the agent through systemctl does not work: the process does not end and the service gets stuck in "stopping". The process also does not respond to SIGTERM or SIGKILL sent with killall or pkill. Rebooting does not fix it; as soon as the process starts on boot it jumps to 100% again. (I should add that this identical config and permission scheme is running on 125+ other machines and only 2 have the issue.)

top output

MY CONFIG

## Debug output: events whose tag matches debug.** are handed to the
## stdout output plugin (registered under the id output_stdout).
<match debug.**>
  @type stdout
  @id output_stdout
</match>

####
## HTTP Source:

# HTTP input
# POST http://localhost:8888/<tag>?json=<json>
# POST http://localhost:8888/td.myapp.login?json={"user"%3A"me"}
# @see http://docs.fluentd.org/articles/in_http
<source>
  # HTTP input plugin, registered under the id input_http.
  @type http
  @id input_http
  # Listening port. No bind address is set in this block, so the plugin's
  # default bind address applies.
  port 8888
</source>

####
## live debugging agent:
<source>
  # Debug agent input, registered under the id input_debug_agent.
  @type debug_agent
  @id input_debug_agent
  # Bound to the loopback address only, on port 24230.
  bind 127.0.0.1
  port 24230
</source>

####
##ALL LOGS

##input

#Source
<source>
  # Tail input covering every file matched by the glob in `path` below.
  @type tail
  @id input_tail_all
  <parse>
    # `none` parser: lines are not split into structured fields.
    @type none
  </parse>
  # NOTE(review): this glob matches ANY file directly under /var/log whose name
  # ends in "log", not only text logs. Confirm that this breadth is intended
  # before enabling this source.
  path /var/log/*log
  # Read positions for the tailed files are stored in this file.
  pos_file /var/log/td-agent/tmp/all.log.pos
  pos_file_compaction_interval 72h
  # The stat-based watcher is turned off for this source.
  enable_stat_watcher false
  # Tag prefix td. plus a per-file suffix; the filters further down match
  # td.var.log.syslog for /var/log/syslog.
  tag td.*
</source>

##Clean

#silence datadog
# Excludes td.var.log.syslog events whose `message` field matches the
# datadog agent pattern. The <and> wrapper contains a single <exclude>.
<filter td.var.log.syslog>
  @type grep
  <and>
    <exclude>
      key message
      pattern pkg/collector/python/datadog_agent
    </exclude>
  </and>
</filter>

# Same structure as the filter above, excluding events whose `message`
# field matches chef-client.
<filter td.var.log.syslog>
  @type grep
  <and>
    <exclude>
      key message
      pattern chef-client
    </exclude>
  </and>
</filter>

############ This portion is only used for testing. Uncomment if you need to output all logs flagged by td-agent  #############
# ##output
# <match td.var.**>
#   @type file
#   @id output_file
#   <buffer>
#     @type file
#     timekey 1h
#     timekey_use_utc true
#   </buffer>
#   path /var/log/fluent/all_logs/
# </match>
################################################################################################################################

####
##User Logins

#Source files
<source>
  # Tail input for the authentication log only, registered as in_tail_logins.
  @type tail
  @id in_tail_logins
  <parse>
    # Lines are parsed with the syslog parser, using its regexp implementation.
    @type syslog
    parser_type regexp
  </parse>
  path /var/log/auth.log
  # Separate position file from the one used by input_tail_all.
  pos_file /var/log/td-agent/tmp/auth.log.pos
  pos_file_compaction_interval 72h
  enable_stat_watcher false
  # Fixed tag; the td.auth filter and match blocks below consume it.
  tag td.auth
</source>


##clean

#silence docker
<filter td.auth>
  # Excludes td.auth events whose `message` field matches "docker".
  @type grep
  <exclude>
    key message
    pattern docker
  </exclude>
</filter>

##tag sub-types
#
# Re-tags td.auth events according to the content of their `message` field.
# Each <rule> pairs a regular expression with the new tag to emit under.
# In these patterns an unescaped "." matches any single character, and a
# trailing "+" applies only to the character immediately before it.
# NOTE(review): rule order presumably matters (first matching rule wins) and
# events matching no rule are presumably not re-emitted; verify against the
# rewrite_tag_filter plugin documentation.
<match td.auth>
  @type rewrite_tag_filter
  <rule>
    # Messages containing "COMMAND" -> tag sudo
    key message
    pattern /COMMAND/
    tag sudo
  </rule>
  <rule>
    # "Accepted<any char>publickey ... ssh ..." -> tag ssh
    key message
    pattern /Accepted.publickey.+ssh.+/
    tag ssh
  </rule>
  <rule>
    # "<any char>nx:session ... session<any char>opened" -> tag nomachine
    key message
    pattern /.nx\:session.*session.opened+/
    tag nomachine
  </rule>
  <rule>
    # "session closed for user" (spaces matched by ".") -> tag logout
    key message
    pattern /session.closed.for.user+/
    tag logout
  </rule>
</match>

##transform output

#transform sudo
<filter sudo>
  # Parses the `message` field of sudo-tagged events with the regexp below.
  @type parser
  key_name message
  <parse>
    @type regexp
    # Named captures: `user` is the first run of non-space characters,
    # `sudoer` is the value after "USER=", and `command` is everything after
    # "COMMAND=" up to the end of the line. Five space-separated tokens are
    # skipped between `user` and " ; USER=".
    expression /(?<user>[^ ]+)[^ ]* [^ ]* [^ ]* [^ ]* [^ ]* ; USER=(?<sudoer>[^ ]+) ; COMMAND=(?<command>.*)$/
  </parse>
</filter>

#transform ssh
<filter ssh>
  # Extracts a `user` field from the `message` of ssh-tagged events.
  @type parser
  key_name message
  <parse>
    @type regexp
    # The lookbehind requires the text "for " immediately before the capture;
    # `user` is then one arbitrary character followed by one or more
    # non-space characters.
    expression /(?<user>(?<=for ).[^ ]+)/
  </parse>
</filter>
<filter ssh>
  # Adds a constant `login-type` field with the value ssh.
  @type record_transformer
  <record>
    login-type ssh
  </record>
</filter>

#transform nomachine
<filter nomachine>
  # Extracts a `user` field from the `message` of nomachine-tagged events.
  @type parser
  key_name message
  <parse>
    @type regexp
    # The lookbehind requires "for", any one character, then "user " right
    # before the capture; `user` is one arbitrary character followed by one
    # or more non-space characters.
    expression /(?<user>(?<=for.user ).[^ ]+)/
  </parse>
</filter>
<filter nomachine>
  # Adds a constant `login-type` field with the value nomachine.
  @type record_transformer
  <record>
    login-type nomachine
  </record>
</filter>

#transform logout
# Three grep filters run in sequence on logout-tagged events; each excludes
# events whose `message` field matches its pattern.
<filter logout>
  @type grep
  <exclude>
    key message
    pattern /root/
  </exclude>
</filter>
<filter logout>
  @type grep
  <exclude>
    key message
    pattern /cron/
  </exclude>
</filter>
<filter logout>
  @type grep
  <exclude>
    key message
    # NOTE(review): this pattern is unanchored, so it matches any message
    # containing the letters "su" (for example "sudo"), not only the word
    # "su". Confirm that this is the intended scope.
    pattern /su/
  </exclude>
</filter>
<filter logout>
  # Extracts a `user` field from `message`, using the same expression as the
  # nomachine parser filter.
  @type parser
  key_name message
  <parse>
    @type regexp
    expression /(?<user>(?<=for.user ).[^ ]+)/
  </parse>
</filter>
<filter logout>
  # Adds a constant `login-type` field with the value logout.
  @type record_transformer
  <record>
    login-type logout
  </record>
</filter>


##output

#output sudo
<match sudo>
  # File output for sudo-tagged events, registered as auth_output_file.
  @type file
  @id auth_output_file
  <buffer>
    # Events are grouped into one-day time slices, computed in UTC.
    timekey 1d
    timekey_use_utc true
  </buffer>
  path /var/log/fluent/sudo/
</match>

#output ssh nomachine
<match ssh nomachine logout>
  # Single file output shared by the ssh, nomachine and logout tags,
  # registered as ssh_login_output.
  @type file
  @id ssh_login_output
  <buffer>
    # Events are grouped into one-day time slices, computed in UTC.
    timekey 1d
    timekey_use_utc true
  </buffer>
  path /var/log/fluent/logins/
</match>

STRACE sample

read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
futex(0x7f5700328e94, FUTEX_WAKE_PRIVATE, 1) = 1
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
futex(0x7f5700328e90, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f570781ea80, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x7f570781ea10, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x7f570781ea10, FUTEX_WAKE_PRIVATE, 1) = 0
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
futex(0x7f5700328e94, FUTEX_WAKE_PRIVATE, 1) = 1
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
futex(0x7f570781ea10, FUTEX_WAIT_PRIVATE, 2, NULL) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x7f570781ea10, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x7f570781ea10, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x7f570781ea84, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f570781ea10, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f5700328e90, FUTEX_WAKE_PRIVATE, 1) = 1
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
futex(0x7f5700328e94, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f570781ea80, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x7f570781ea10, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x7f570781ea10, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x7f5700328e90, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f570781ea10, FUTEX_WAKE_PRIVATE, 1) = 1
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
futex(0x7f5707858214, FUTEX_WAIT_PRIVATE, 0, {tv_sec=0, tv_nsec=31753510}) = 0
futex(0x7f570781ea10, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x7f570781ea84, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f570781ea10, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f5700328e94, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f570781ea10, FUTEX_WAKE_PRIVATE, 1) = 1
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
futex(0x7f5700328b90, FUTEX_WAKE_PRIVATE, 1) = 1
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
futex(0x7f5700328b94, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f570781ea80, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x7f570781ea10, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x7f5707858214, FUTEX_WAIT_PRIVATE, 0, {tv_sec=0, tv_nsec=41667651}) = 0
futex(0x7f570781ea10, FUTEX_WAKE_PRIVATE, 1) = 0
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
futex(0x7f5700328e90, FUTEX_WAKE_PRIVATE, 1) = 1
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
futex(0x7f5700328e94, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f570781ea84, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x7f570781ea10, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x7f570781ea10, FUTEX_WAKE_PRIVATE, 1) = 0
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
read(29, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192

Upvotes: 0

Views: 510

Answers (1)

Alita Brown
Alita Brown

Reputation: 11

Solved. A test configuration at the top of the file (the `input_tail_all` source with `path /var/log/*log`) was not commented out properly, so td-agent was reading everything in /var/log that it had permission for.

Upvotes: 1

Related Questions