A short followup: Our problem was caused by the bad TCP reading code in
the nsca server ("sleep 1" on EAGAIN). Although the code is still bad, we
could fix our problem by just using usleep(5000) instead of sleep(1).

If others are interested too, I attached 2 small patches. The patches
are against nsca-2.9.1.

netutils.c.2.9.1-patch.txt: Fix long timeout on EAGAIN

nsca.c.2.9.1-patch.txt: Fix logging of TCP port and improve logging when
debug mode is activated (log more messages, log TCP file descriptor)

regards
Klaus

On 26.02.2015 17:07, Klaus Darilion wrote:
> Hi!
> 
> We are using Icinga with NSCA 2.9.1 in single-process mode with delivery
> to the checkresults directory.
> 
> It seems the single-process networking code is not optimal: reading from
> one TCP connection blocks all other TCP connections until NSCA has finished
> reading all the data from the current TCP connection. Thus, reading data
> on a slow TCP connection causes delays for all other incoming passive checks.
> 
> I am a bit afraid of switching to multi-process mode due to the high fork
> rate (sometimes we have up to 100 passive check results per second).
> 
> So, what do other people use for optimal NSCA performance?
> single-process or multi-process? Is it worth considering some NSCA
> replacement?
> 
> Thanks
> Klaus
> _______________________________________________
> icinga-users mailing list
> [email protected]
> https://lists.icinga.org/mailman/listinfo/icinga-users
> 
--- nsca-2.9.1/src/netutils.c   2015-03-04 09:48:37.000000000 +0000
+++ netutils.c  2015-03-04 09:47:15.000000000 +0000
@@ -254,7 +254,12 @@
                        time(&current_time);
                        if(current_time-start_time>timeout)
                                break;
-                       sleep(1);
+                       /* This code is rather bad, as we wait for the whole 
data from this certain
+                          client until we poll other connections again. Thus, 
if this TCP connection
+                          is slow, meanwhile all other existing and incoming 
TCP connections will not
+                          serviced.
+                          Let's wait some milliseconds and hope that the data 
meanwhile arrived. */
+                       usleep(5000);
                        continue;
                        }
 
--- nsca-2.9.1/src/nsca.c       2015-03-04 09:48:37.000000000 +0000
+++ nsca.c      2015-03-04 09:47:49.000000000 +0000
@@ -841,7 +841,7 @@
         syslog(LOG_NOTICE,"Starting up daemon");
 
         if(debug==TRUE){
-                syslog(LOG_DEBUG,"Listening for connections on port 
%d\n",htons(myname.sin_port));
+                syslog(LOG_INFO,"Listening for connections on port 
%d\n",htons(myname.sin_port));
                 }
 
        /* socket should be non-blocking for mult-process daemon */
@@ -977,7 +977,7 @@
 
         /* log info to syslog facility */
         if(debug==TRUE)
-                syslog(LOG_DEBUG,"Connection from %s port 
%d",inet_ntoa(nptr->sin_addr),nptr->sin_port);
+                syslog(LOG_INFO,"%d: Connection from %s port 
%d",new_sd,inet_ntoa(nptr->sin_addr),htons(nptr->sin_port));
 
        /* handle the connection */
        if(mode==SINGLE_PROCESS_DAEMON)
@@ -1004,7 +1004,7 @@
 
         /* log info to syslog facility */
         if(debug==TRUE)
-                syslog(LOG_INFO,"Handling the connection...");
+                syslog(LOG_INFO,"%d: Handling the connection...",sock);
 
         /* socket should be non-blocking */
         fcntl(sock,F_GETFL,&flags);
@@ -1029,7 +1029,7 @@
 
         /* there was an error sending the packet */
         if(rc==-1){
-                syslog(LOG_ERR,"Could not send init packet to client\n");
+                syslog(LOG_ERR,"%d: Could not send init packet to 
client\n",sock);
                 encrypt_cleanup(decryption_method,CI);
                 close(sock);
                if(mode==MULTI_PROCESS_DAEMON)
@@ -1039,7 +1039,7 @@
 
         /* for some reason we didn't send all the bytes we were supposed to */
        else if(bytes_to_send<sizeof(send_packet)){
-                syslog(LOG_ERR,"Only able to send %d of %d bytes of init 
packet to client\n",rc,sizeof(send_packet));
+                syslog(LOG_ERR,"%d: Only able to send %d of %d bytes of init 
packet to client\n",sock,rc,sizeof(send_packet));
                 encrypt_cleanup(decryption_method,CI);
                 close(sock);
                if(mode==MULTI_PROCESS_DAEMON)
@@ -1103,7 +1103,7 @@
                         }
                                else {
                         if(debug==TRUE)
-                                syslog(LOG_ERR,"End of connection...");
+                                syslog(LOG_INFO,"%d: End of 
connection...",sock);
                         encrypt_cleanup(decryption_method, CI);
                         close(sock);
                         if(mode==SINGLE_PROCESS_DAEMON)
@@ -1115,7 +1115,7 @@
 
         /* we couldn't read the correct amount of data, so bail out */
         if(bytes_to_recv!=packet_length){
-                syslog(LOG_ERR,"Data sent from client was too short (%d < %d), 
aborting...",bytes_to_recv,packet_length);
+                syslog(LOG_ERR,"%d: Data sent from client was too short (%d < 
%d), aborting...",sock,bytes_to_recv,packet_length);
                 encrypt_cleanup(decryption_method, CI);
                 close(sock);
                return;
@@ -1134,7 +1134,7 @@
 
         /* make sure this is the right type of packet */
         if(ntohs(receive_packet.packet_version)!=NSCA_PACKET_VERSION_3){
-                syslog(LOG_ERR,"Received invalid packet type/version from 
client - possibly due to client using wrong password or crypto algorithm?");
+                syslog(LOG_ERR,"%d: Received invalid packet type/version from 
client - possibly due to client using wrong password or crypto 
algorithm?",sock);
                /*return;*/
                close(sock);
                 if(mode==SINGLE_PROCESS_DAEMON)
@@ -1148,7 +1148,7 @@
         receive_packet.crc32_value=0L;
         calculated_crc32=calculate_crc32((char 
*)&receive_packet,packet_length);
         if(packet_crc32!=calculated_crc32){
-                syslog(LOG_ERR,"Dropping packet with invalid CRC32 - possibly 
due to client using wrong password or crypto algorithm?");
+                syslog(LOG_ERR,"%d: Dropping packet with invalid CRC32 - 
possibly due to client using wrong password or crypto algorithm?",sock);
                 /*return;*/
                close(sock);
                 if(mode==SINGLE_PROCESS_DAEMON)
@@ -1163,11 +1163,11 @@
 
         packet_age=(unsigned long)(current_time-packet_time);
         if(debug==TRUE)
-                  syslog(LOG_ERR,"Time difference in packet: %lu seconds for 
host %s", packet_age, host_name);
+                  syslog(LOG_DEBUG,"%d: Time difference in packet: %lu seconds 
for host %s", sock, packet_age, host_name);
         if((max_packet_age>0 && (packet_age>max_packet_age) && 
(packet_age>=0)) ||
                 ((max_packet_age>0) && (packet_age<(0-max_packet_age)) && 
(packet_age < 0))
         ){
-                syslog(LOG_ERR,"Dropping packet with stale timestamp for %s - 
packet was %lu seconds old.",host_name,packet_age);
+                syslog(LOG_ERR,"%d: Dropping packet with stale timestamp for 
%s - packet was %lu seconds old.",sock,host_name,packet_age);
                close(sock);
                 if(mode==SINGLE_PROCESS_DAEMON)
                         return;
@@ -1191,9 +1191,9 @@
         /* log info to syslog facility */
         if(debug==TRUE){
                if(!strcmp(svc_description,""))
-                       syslog(LOG_NOTICE,"HOST CHECK -> Host Name: '%s', 
Return Code: '%d', Output: '%s'",host_name,return_code,plugin_output);
+                       syslog(LOG_NOTICE,"%d: received HOST CHECK -> Host 
Name: '%s', Return Code: '%d', Output: 
'%s'",sock,host_name,return_code,plugin_output);
                else
-                       syslog(LOG_NOTICE,"SERVICE CHECK -> Host Name: '%s', 
Service Description: '%s', Return Code: '%d', Output: 
'%s'",host_name,svc_description,return_code,plugin_output);
+                       syslog(LOG_NOTICE,"%d: received SERVICE CHECK -> Host 
Name: '%s', Service Description: '%s', Return Code: '%d', Output: 
'%s'",sock,host_name,svc_description,return_code,plugin_output);
                }
 
         /* write the check result to the external command file.
@@ -1204,9 +1204,15 @@
          */
         //syslog(LOG_ERR,"'%s' (%s) []",check_result_path, 
strlen(check_result_path));
         if (check_result_path==NULL){
-        
write_check_result(host_name,svc_description,return_code,plugin_output,time(NULL));
+                
write_check_result(host_name,svc_description,return_code,plugin_output,time(NULL));
+                if(debug==TRUE){
+                        syslog(LOG_NOTICE,"%d: wrote check result to 
FIFO",sock);
+                }
         }else{
                 
write_checkresult_file(host_name,svc_description,return_code,plugin_output,time(NULL));
+                if(debug==TRUE){
+                        syslog(LOG_NOTICE,"%d: wrote check result to 
checkresult directory",sock);
+                }
         }
 
        return;
_______________________________________________
icinga-users mailing list
[email protected]
https://lists.icinga.org/mailman/listinfo/icinga-users

Reply via email to