Hi All,
Since upgrading to 1.4 (and ES 1.1) I've had no luck at all consuming logs, 
with events averaging about 10k a second and peaking around 15k. Logstash 
processing cruises right along for a short while, then (without any errors 
or other obvious reason) the redis input queue starts to back up. 
Eventually redis hits its memory limit and the shippers stop sending logs. 
Once the queue is full, Logstash can't ever catch up, and I have to wipe 
out the queue and start over. Even with zero shippers sending logs to 
redis, I can watch the queue length and see that Logstash, once it gets 
into this state, is only handling a few thousand events every few seconds, 
which is extremely slow.
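
For reference, the backlog and drain rate can be watched with a quick 
redis-py sketch along these lines (the host and list key are the ones from 
my indexer1 config at the bottom of this mail; the 10-second sample 
interval is arbitrary):

# Minimal sketch: sample the redis list Logstash consumes from and print
# how fast it grows or drains.
import time
import redis

r = redis.StrictRedis(host="log1", port=6379)

prev = r.llen("sfo1-logstash")
while True:
    time.sleep(10)
    cur = r.llen("sfo1-logstash")
    # positive delta = backing up, negative delta = draining
    print("queue length: %d (change over last 10s: %+d)" % (cur, cur - prev))
    prev = cur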

The underlying infrastructure is pretty robust; here are the specs:

Indexer1:  2x6core cpus, 24gb memory
Indexer2:  2x8core cpus, 64gb memory

ES cluster:

3 machines, each with 2x8-core CPUs, 64gb memory (31gb for the ES heap), 
and all-SSD drives.

The ES machines should easily be able to handle what I'm throwing at them, 
and load/IOPS/etc. are well within limits at all times, so I don't think 
the problem is ES.
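
That said, one way to double-check raw bulk throughput against the cluster 
outside of Logstash entirely is a rough Python sketch like the one below. 
It uses the elasticsearch-py client; the node address "es1:9200" and the 
test index name are placeholders rather than my real hosts:

# Rough bulk-indexing sanity check against the ES cluster, bypassing
# Logstash. Node address and index name are placeholders.
import time
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch(["http://es1:9200"])

def docs(n):
    for i in range(n):
        yield {
            "_index": "throughput-test",
            "_type": "test",   # doc type still required on ES 1.x
            "_source": {"message": "benchmark event %d" % i},
        }

start = time.time()
ok, _ = helpers.bulk(es, docs(100000), chunk_size=5000)
elapsed = time.time() - start
print("indexed %d docs in %.1fs (%.0f docs/sec)" % (ok, elapsed, ok / elapsed))

If that comfortably clears the ~15k events/sec peak I'm seeing, it points 
back at the Logstash side rather than ES.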

I've got the two Logstash indexer machines pulling from two different 
redis queues, one on each host. Indexer2 handles just a single, 
high-volume log. Indexer1 handles lower volume but several logs and more 
Logstash processing. I was thinking of switching them around to throw more 
CPU at the heavier-processing Logstash instance, but honestly I doubt it 
would help with the backup/throughput issues, as graphs from both machines 
show they are not stressed for CPU at all. In fact load is pretty low; 
Logstash just isn't using the CPUs much once it gets backed up.

Attached are my two indexer configs. I've tried bumping the redis input's 
threads setting, but it has little effect, and the same goes for 
batch_count. Both indexers are started with -w (number of cores - 1) and 
have an 8gb heap configured for Logstash.

Host OS and versions:

CentOS 6.2 with latest kernel
Sun JRE 1.7.0_45
Logstash 1.4.0-1_c82dc09
ES 1.1.0

Here are some graphs showing the redis queue:
<https://lh4.googleusercontent.com/-t6wTedvMHLY/U0bq7nejaqI/AAAAAAAAACo/NbAjNnHuBAI/s1600/indexer1.png>
<https://lh5.googleusercontent.com/-L-du00g5mAs/U0bq-GtSFrI/AAAAAAAAACw/LJEbtGXnQ34/s1600/indexer2.png>

And CPU usage during the same timeframe:
<https://lh5.googleusercontent.com/-EHhbNorLiMY/U0bsqBGSE6I/AAAAAAAAADM/f21iu_46e_M/s1600/indexer2_cpu.png>
<https://lh3.googleusercontent.com/-e4TAQXmAdK4/U0bsZuA2Z3I/AAAAAAAAAC8/u3G7BzNkGWE/s1600/indexer1_cpu.png>
  


My two indexer configs are included at the bottom of this mail: indexer1 
(pulling from log1) first, then indexer2 (pulling from log2).

I'm really frustrated by this issue; I've tried everything I can think of, 
but the end result is always the same. Any help or thoughts would be 
greatly appreciated.

-Ryan



indexer1 config:

input {
  redis {
    host      => "log1"
    data_type => "list"
    key       => "sfo1-logstash"
    batch_count  => 100
    threads  => 8
  }
}

filter {

## httpd pages
  if [type] == "sfo1-httpd-access-pages" {
    grok { match => [ "message", "%{IPORHOST:website} 
(?:%{IPORHOST:requester_ip}|-) %{DATA:remote_logname} %{DATA:remote_user} 
\[%{HTTPDATE:http_timestamp}\] %{QS:http_request} %{NUMBER:http_status} 
(?:%{NUMBER:response_bytes}|-) %{QS:referrer} %{QS:user_agent} %{QS:WCID_WUID} 
%{QS:WPOS} %{QS:WTST}" ]
         add_tag => "httpd_access"
         add_tag => "pages"
         add_tag => "sfo1"
    }
    useragent { 
      source => "user_agent" 
    }
    if [user_agent] =~ "a10hm\/1\.0" {
      drop { }
    }
    date { 
      match => [ "http_timestamp", "dd/MMM/yyyy:HH:mm:ss Z" ] 
    }
  }

## httpd editor
  if [type] == "sfo1-httpd-access-editor" {
    grok { match => [ "message", "%{IPORHOST:website} 
(?:%{IPORHOST:requester_ip}|-) %{DATA:remote_logname} %{DATA:remote_user} 
\[%{HTTPDATE:http_timestamp}\] %{QS:http_request} %{NUMBER:http_status} 
(?:%{NUMBER:response_bytes}|-) %{QS:referrer} %{QS:user_agent} %{QS:WCID_WUID} 
%{QS:WPOS} %{QS:WTST}" ]
         add_tag => "httpd_access"
         add_tag => "editor"
         add_tag => "sfo1"
    }
    useragent { 
      source => "user_agent" 
    }
    if [user_agent] =~ "a10hm\/1\.0" {
      drop { }
    }
    date { 
      match => [ "http_timestamp", "dd/MMM/yyyy:HH:mm:ss Z" ] 
    }
  }

## php errors pages
  if [type] == "sfo1-php-errors-pages" {
    mutate { add_tag => "php_errors"
             add_tag => "pages"
             add_tag => "sfo1"
          remove_tag => "multiline"
    }
  }

## php errors editor
  if [type] == "sfo1-php-errors-editor" {
    mutate { add_tag => "php_errors"
             add_tag => "editor"
             add_tag => "sfo1"
          remove_tag => "multiline"
    }
  }

## modsec pages
  if [type] == "sfo1-modsec-pages" {
    mutate { add_tag => "modsecurity"
             add_tag => "pages"
             add_tag => "sfo1"
          remove_tag => "multiline"
    }
    if [ "message" ] =~ "Error reading request body: Connection reset by peer"  
{
      mutate {
        add_tag => "modsecurity_conn_reset"
      }
    }
  }

## modsec editor
  if [type] == "sfo1-modsec-editor" {
    mutate { add_tag => "modsecurity"
             add_tag => "editor"
             add_tag => "sfo1"
          remove_tag => "multiline"
    }
    if [ "message" ] =~ "Error reading request body: Connection reset by peer"  
{
      mutate {
        add_tag => "modsecurity_conn_reset"
      }
    }
  }

# rabbitmq consumers
  if [type] == "sfo1-rabbitmq-consumers" {
    grok { match => [ "message", "%{WORD:field1}:%{NUMBER:eventIdnumber} 
%{WORD:field2}:%{HOST:rabbitmq_host} %{WORD:field3}:%{WORD:application} 
%{GREEDYDATA:message_remainder}" ]
           remove_field => [ "field1" ]
           remove_field => [ "field2" ]
           remove_field => [ "field3" ]
           remove_field => [ "rabbitmq_host" ]
           remove_field => [ "message_remainder" ]
                add_tag => [ "%{application}" ]
    }
    mutate { add_tag => "rabbitmq_consumers"
             add_tag => "sfo1"
          remove_tag => "multiline"
    }
  }
}

output {
  elasticsearch {
       cluster => "elasticsearch-sfo1"
         index => "logstash-sfo1-%{+YYYY.MM.dd}"
       workers => 4
    flush_size => 5000
  }
}

indexer2 config:

input {
  redis {
    host      => "log2"
    data_type => "list"
    key       => "sfo1-logstash-nginx"
    batch_count  => 100
    threads  => 8
  }
}

filter {

## nginx
  if [type] == "sfo1-nginx-access-pages" {
    grok { match => [ "message", "%{IPORHOST:requester_ip} 
(?:%{IPORHOST:vip_ip}|-) \[%{HTTPDATE:http_timestamp}\] 
(?:%{DATA:website}|%{IPORHOST:website}) %{QS:http_request} 
%{NUMBER:http_status} %{NUMBER:response_bytes} %{QS:referrer} %{QS:user_agent}" 
]
           add_tag => "nginx_access"
           add_tag => "pages"
           add_tag => "sfo1"
    }
    useragent { 
      source => "user_agent" 
    }
    date { 
      match => [ "http_timestamp", "dd/MMM/yyyy:HH:mm:ss Z" ]
    }
  }
}

output {
  elasticsearch {
       cluster => "elasticsearch-sfo1"
         index => "logstash-sfo1-%{+YYYY.MM.dd}"
       workers => 4
    flush_size => 5000
  }
}
