I wanted to compare the 72_scores.cf file against the one from yesterday and the one from March. However, the svn diff is not very readable, and it's very hard to see which rules were added or removed. I also wanted to compare the ranges.data file, which also contains rules and score ranges.
I patched together a small script for comparing score files. See the attachment for the (Perl) script. It produces output like this (comparing March against last night): ./compare-rulefiles 72_scores_20170315.cf 72_scores-1815282.cf Only in 1 (removed in 2) AC_HTML_NONSENSE_TAGS AC_SPAMMY_URI_PATTERNS1 AC_SPAMMY_URI_PATTERNS10 AC_SPAMMY_URI_PATTERNS11 AC_SPAMMY_URI_PATTERNS12 AC_SPAMMY_URI_PATTERNS2 AC_SPAMMY_URI_PATTERNS3 AC_SPAMMY_URI_PATTERNS4 AC_SPAMMY_URI_PATTERNS8 AC_SPAMMY_URI_PATTERNS9 ADVANCE_FEE_2_NEW_FORM ADVANCE_FEE_4_NEW ADVANCE_FEE_5_NEW AXB_XMAILER_MIMEOLE_OL_1ECD5 AXB_XM_FORGED_OL2600 BODY_EMPTY CANT_SEE_AD CN_B2B_SPAMMER COMMENT_GIBBERISH ENCRYPTED_MESSAGE FBI_MONEY FBI_SPOOF FORM_LOW_CONTRAST FOUND_YOU FREEMAIL_DOC_PDF_BCC FROM_WORDY_SHORT FSL_HELO_BARE_IP_2 GOOGLE_DOCS_PHISH GOOGLE_DOCS_PHISH_MANY GOOG_MALWARE_DNLD HDRS_LCASE HEXHASH_WORD HK_SCAM_N15 HTML_OFF_PAGE LIST_PRTL_PUMPDUMP LIST_PRTL_SAME_USER LOTTO_AGENT LOTTO_DEPT LUCRATIVE MONEY_LOTTERY MSGID_NOFQDN1 MSM_PRIO_REPTO PHP_NOVER_MUA PHP_ORIG_SCRIPT PHP_SCRIPT_MUA PP_TOO_MUCH_UNICODE02 PP_TOO_MUCH_UNICODE05 PUMPDUMP PUMPDUMP_MULTI RAND_HEADER_MANY RP_MATCHES_RCVD SHARE_50_50 SPOOFED_FREEM_REPTO_CHN STOCK_LOW_CONTRAST STOCK_TIP SYSADMIN TO_NO_BRKTS_FROM_MSSP TW_GIBBERISH_MANY UC_GIBBERISH_OBFU URI_DATA URI_GOOGLE_PROXY URI_OPTOUT_3LD URI_PHISH XPRIO_SHORT_SUBJ Only in 2 (added in 2) ADVANCE_FEE_4_NEW_MONEY ADVANCE_FEE_5_NEW_FRM_MNY ADVANCE_FEE_5_NEW_MONEY APOSTROPHE_TOCC AXB_X_AOL_SEZ_S DEAR_BENEFICIARY FROM_MISSP_DYNIP FROM_MISSP_EH_MATCH FSL_HELO_FAKE FSL_MIME_NO_TEXT FUZZY_UNSUBSCRIBE HDRS_MISSP MANY_PILL_PRICE MILLION_USD MONEY_ATM_CARD MONEY_FORM_SHORT MONEY_FROM_41 SERGIO_SUBJECT_VIAGRA01 SHORTENED_URL_SRC SINGLETS_LOW_CONTRAST TO_NO_BRKTS_DYNIP Changed AC_BR_BONANZA 0.001 0.001 0.001 0.001 0.001 0.967 0.001 0.967 AC_DIV_BONANZA 0.001 0.001 0.001 0.001 0.909 0.001 0.909 0.001 ADVANCE_FEE_2_NEW_MONEY 1.997 0.001 1.997 0.001 0.001 0.039 0.001 0.039 ADVANCE_FEE_3_NEW 3.496 0.001 3.496 0.001 
2.099 1.697 2.099 1.697 ADVANCE_FEE_3_NEW_MONEY 2.796 0.001 2.796 0.001 3.100 2.696 3.100 2.696 AXB_XMAILER_MIMEOLE_OL_024C2 0.367 0.001 0.367 0.001 0.001 2.822 0.001 2.822 BODY_URI_ONLY 0.998 0.001 0.998 0.001 3.500 3.595 3.500 3.595 BOGUS_MSM_HDRS 0.909 0.001 0.909 0.001 1.912 0.677 1.912 0.677 CK_HELO_DYNAMIC_SPLIT_IP 1.350 0.001 1.350 0.001 1.500 0.001 1.500 0.001 CK_HELO_GENERIC 0.249 0.249 0.249 0.249 2.195 1.350 2.195 1.350 DATE_IN_FUTURE_96_Q 3.296 3.299 3.296 3.299 2.999 2.696 2.999 2.696 FILL_THIS_FORM 2.748 0.001 2.748 0.001 0.200 1.402 0.200 1.402 FORM_FRAUD 0.998 0.001 0.998 0.001 3.099 1.841 3.099 1.841 FORM_FRAUD_3 2.696 0.001 2.696 0.001 2.899 0.383 2.899 0.383 FORM_FRAUD_5 0.209 0.001 0.209 0.001 1.531 1.802 1.531 1.802 FREEMAIL_FORGED_FROMDOMAIN 0.001 0.199 0.001 0.199 0.001 0.001 0.001 0.001 FROM_IN_TO_AND_SUBJ 0.287 0.262 0.287 0.262 1.001 0.001 1.001 0.001 FROM_MISSP_FREEMAIL 3.595 0.001 3.595 0.001 2.965 3.393 2.965 3.393 FROM_MISSP_MSFT 0.001 0.001 0.001 0.001 0.001 3.129 0.001 3.129 FROM_MISSP_REPLYTO 0.001 0.001 0.001 0.001 3.799 0.001 3.799 0.001 FROM_MISSP_SPF_FAIL 0.001 1.000 0.001 1.000 0.001 0.001 0.001 0.001 FROM_MISSP_TO_UNDISC 1.438 0.001 1.438 0.001 1.102 0.275 1.102 0.275 FROM_MISSP_USER 0.001 0.001 0.001 0.001 0.001 1.416 0.001 1.416 FROM_MISSP_XPRIO 0.001 0.001 0.001 0.001 0.001 3.595 0.001 3.595 FROM_WORDY 2.497 0.001 2.497 0.001 3.299 2.511 3.299 2.511 FSL_CTYPE_WIN1251 0.001 0.001 0.001 0.001 0.001 0.588 0.001 0.588 FSL_NEW_HELO_USER 0.083 0.001 0.083 0.001 0.001 0.755 0.001 0.755 HELO_MISC_IP 0.248 0.250 0.248 0.250 2.600 1.357 2.600 1.357 HK_RANDOM_FROM 0.998 0.001 0.998 0.001 1.501 2.664 1.501 2.664 HK_SCAM_N2 3.249 0.001 3.249 0.001 3.099 2.696 3.099 2.696 IMG_DIRECT_TO_MX 2.397 2.400 2.397 2.400 3.599 1.743 3.599 1.743 LONG_HEX_URI 2.194 2.290 2.194 2.290 0.399 0.884 0.399 0.884 LONG_IMG_URI 0.553 0.100 0.553 0.100 0.001 0.001 0.001 0.001 LOTS_OF_MONEY 0.001 0.001 0.001 0.001 0.001 0.006 0.001 0.006 MIMEOLE_DIRECT_TO_MX 
1.445 0.381 1.445 0.381 2.399 0.737 2.399 0.737 MIME_NO_TEXT 1.000 1.000 1.000 1.000 3.505 2.941 3.505 2.941 MONEY_FRAUD_3 2.896 0.001 2.896 0.001 3.100 0.243 3.100 0.243 MONEY_FRAUD_5 3.096 0.001 3.096 0.001 3.400 2.896 3.400 2.896 MONEY_FRAUD_8 2.548 0.001 2.548 0.001 0.364 3.199 0.364 3.199 NSL_RCVD_FROM_USER 0.548 0.001 0.548 0.001 0.001 0.159 0.001 0.159 NSL_RCVD_HELO_USER 1.273 0.001 1.273 0.001 2.599 0.970 2.599 0.970 PP_MIME_FAKE_ASCII_TEXT 0.429 0.001 0.429 0.001 0.899 0.001 0.899 0.001 RCVD_IN_MSPIKE_H2 0.001 -2.800 0.001 -2.800 0.001 -1.271 0.001 -1.271 RCVD_IN_MSPIKE_L3 0.001 0.001 0.001 0.001 0.001 1.282 0.001 1.282 RCVD_IN_MSPIKE_L4 0.001 0.001 0.001 0.001 0.001 0.147 0.001 0.147 RCVD_IN_MSPIKE_L5 0.001 0.001 0.001 0.001 0.001 2.283 0.001 2.283 RCVD_IN_MSPIKE_ZBI 0.001 0.001 0.001 0.001 0.001 3.496 0.001 3.496 SPOOFED_FREEM_REPTO 2.498 1.368 2.498 1.368 3.899 3.396 3.899 3.396 STATIC_XPRIO_OLE 1.997 0.001 1.997 0.001 1.668 0.277 1.668 0.277 STYLE_GIBBERISH 2.800 3.093 2.800 3.093 3.499 2.554 3.499 2.554 THIS_AD 0.596 2.200 0.596 2.200 0.001 0.001 0.001 0.001 TO_EQ_FM_DIRECT_MX 2.497 0.622 2.497 0.622 2.799 2.997 2.799 2.997 TO_IN_SUBJ 0.099 0.099 0.099 0.099 2.099 0.001 2.099 0.001 TO_NO_BRKTS_HTML_IMG 0.001 2.000 0.001 2.000 0.001 0.799 0.001 0.799 TO_NO_BRKTS_HTML_ONLY 1.997 0.001 1.997 0.001 0.001 0.001 0.001 0.001 TO_NO_BRKTS_MSFT 2.497 0.001 2.497 0.001 0.602 3.595 0.602 3.595 TO_NO_BRKTS_NORDNS_HTML 0.398 0.001 0.398 0.001 0.001 0.339 0.001 0.339 TO_NO_BRKTS_PCNT 2.497 0.001 2.497 0.001 2.699 2.196 2.699 2.196 TVD_SPACE_ENCODED 2.497 0.001 2.497 0.001 3.599 3.096 3.599 3.096 TVD_SPACE_ENC_FM_MIME 1.997 0.001 1.997 0.001 3.499 2.996 3.499 2.996 TVD_SPACE_RATIO_MINFP 2.497 0.001 2.497 0.001 4.399 4.395 4.399 4.395 URI_ONLY_MSGID_MALF 0.001 1.191 0.001 1.191 2.800 2.880 2.800 2.880 URI_TRY_3LD 0.195 0.001 0.195 0.001 0.001 0.227 0.001 0.227 URI_TRY_USME 0.001 0.001 0.001 0.001 3.299 2.896 3.299 2.896 URI_WPADMIN 3.396 3.014 3.396 3.014 2.899 2.498 
2.899 2.498 URI_WP_DIRINDEX 1.000 1.000 1.000 1.000 4.399 3.995 4.399 3.995 URI_WP_HACKED 2.996 3.000 2.996 3.000 4.099 3.146 4.099 3.146 URI_WP_HACKED_2 1.187 1.764 1.187 1.764 1.479 3.412 1.479 3.412 XPRIO 2.248 2.249 2.248 2.249 0.367 0.140 0.367 0.140
#!/usr/bin/perl -w
#
# Informational tool to compare files with rules in them in a human
# readable way.
# Compares 72_scores.cf files out of the box, but can basically compare
# any file containing rule names.
# Example for comparing ranges.data files:
#   ./compare-rulefiles -r 3 ranges.data.2017-11-09 ranges.data.2017-11-15
#
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

use strict;
use warnings;

use Getopt::Long;

# $hashref = readrulefile($filename, $rulename_column[, $ignore_columns]);
#
# Reads a whitespace-separated rule file and returns a hash ref that maps
# each rule name to an array ref holding the other columns of its line,
# in file order.
#
#   $filename         path of the file to read
#   $rulename_column  zero-based index of the column holding the rule name
#   $ignore_columns   optional array ref of zero-based column indexes that
#                     are left out of the stored values
#
# Everything from a '#' to the end of the line is treated as a comment.
# Blank lines and lines that do not have a rule-name column are skipped.
# If a rule name occurs more than once, the last occurrence wins.
# Dies if the file cannot be opened.
sub readrulefile {
    my ($filename, $rulename_column, $ignore_columns) = @_;
    $ignore_columns ||= [];

    my %is_ignored = map { $_ => 1 } @{$ignore_columns};
    my $rules      = {};

    # Three-argument open with a lexical handle: the file name can never be
    # interpreted as an open mode, and a failure is reported instead of
    # silently producing an empty rule set.
    open(my $in, '<', $filename)
        or die "Cannot open '$filename' for reading: $!\n";

    while (my $line = <$in>) {
        # strip comments, leading/trailing whitespace and empty lines
        $line =~ s/#.*$//;
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;
        next if $line eq '';

        my @columns = split /\s+/, $line;

        # A line that is too short to contain the rule-name column cannot
        # be attributed to any rule.
        my $rule = $columns[$rulename_column];
        next unless defined $rule;

        $rules->{$rule} = [
            map  { $columns[$_] }
            grep { $_ != $rulename_column && !$is_ignored{$_} }
            0 .. $#columns
        ];
    }
    close $in;

    return $rules;
}

# ($only_1, $only_2, $different) = diffrules($rules1, $rules2);
#
# Compares two rule hashes as returned by readrulefile().  Returns three
# hash refs whose keys are rule names (the values are always 1):
#
#   $only_1     rules present in $rules1 but not in $rules2
#   $only_2     rules present in $rules2 but not in $rules1
#   $different  rules present in both whose column values differ
sub diffrules {
    my ($rules1, $rules2) = @_;

    my (%only_1, %only_2, %different);

    for my $rule (keys %{$rules1}) {
        if (!exists $rules2->{$rule}) {
            $only_1{$rule} = 1;
            next;
        }

        # Values are compared as space-joined strings.
        my $values1 = join ' ', @{ $rules1->{$rule} };
        my $values2 = join ' ', @{ $rules2->{$rule} };
        $different{$rule} = 1 if $values1 ne $values2;
    }

    for my $rule (keys %{$rules2}) {
        $only_2{$rule} = 1 if !exists $rules1->{$rule};
    }

    return (\%only_1, \%only_2, \%different);
}

# writediff($rules1, $rules2, $only_1, $only_2, $different);
#
# Prints the result of diffrules() to the currently selected output handle
# as three sections (removed, added, changed), each sorted by rule name.
# For a changed rule, the values from $rules1 and then from $rules2 are
# printed on the two lines following the rule name.
sub writediff {
    my ($rules1, $rules2, $only_1, $only_2, $different) = @_;

    print "Only in 1 (removed in 2)\n";
    print "$_\n" for sort keys %{$only_1};
    print "\n";

    print "Only in 2 (added in 2)\n";
    print "$_\n" for sort keys %{$only_2};
    print "\n";

    print "Changed\n";
    for my $rule (sort keys %{$different}) {
        print "$rule\n ", join(' ', @{ $rules1->{$rule} }),
              "\n ",      join(' ', @{ $rules2->{$rule} }), "\n";
    }
    print "\n";

    return;
}

# usage();
#
# Dies with the usage message; never returns.
sub usage {
    die <<'END_USAGE';

compare-rulefiles: informational tool to compare files with rules in them,
in a human readable way

usage: ./compare-rulefiles [OPTION]... FILE1 FILE2

All options with a <colnum> value: this is the column in the input file,
starting counting from zero. Columns are whitespace separated.

  -i, --ignore=<colnum>    Which column to ignore. Can be specified
                           multiple times. Defaults to 0 (first column)
                           when --rulename is not given.
  -r, --rulename=<colnum>  Which column contains the rule name.
                           Defaults to 1 (second column).
  -h, --help               This help.

END_USAGE
}

# $exit_status = main();
#
# Command line entry point: parses the options in @ARGV, validates the two
# file arguments, and prints the comparison.  Returns 0 on success; every
# error path ends in usage(), which dies.
sub main {
    my @ignore_columns;
    my $rulename;
    my $help;

    # --help only sets a flag.  Getopt::Long traps a die raised inside an
    # option handler and turns it into a warning, so calling usage() from
    # the handler would not stop the program.
    GetOptions(
        'ignore=i'   => \@ignore_columns,
        'rulename=i' => \$rulename,
        'help|h|?'   => \$help,
    ) or usage();
    usage() if $help;

    if (!defined $rulename) {
        # assume the defaults for 72_scores.cf: "score RULENAME values..."
        $rulename = 1;
        @ignore_columns = (0) if !@ignore_columns;
    }

    my ($file1, $file2) = @ARGV;
    if (not defined $file1) {
        warn "Missing FILE1 argument!\n";
        usage();
    }
    if (not defined $file2) {
        warn "Missing FILE2 argument!\n";
        usage();
    }
    if (! -r $file1) {
        warn "File FILE1 ($file1) not readable file!\n";
        usage();
    }
    if (! -r $file2) {
        warn "File FILE2 ($file2) not readable file!\n";
        usage();
    }

    my $rules1 = readrulefile($file1, $rulename, \@ignore_columns);
    my $rules2 = readrulefile($file2, $rulename, \@ignore_columns);

    my ($only_1, $only_2, $different) = diffrules($rules1, $rules2);
    writediff($rules1, $rules2, $only_1, $only_2, $different);

    return 0;
}

# Run the command line tool only when executed as a script; when the file
# is loaded with require/do the subs are defined without running main().
exit main() unless caller;

1;
