Index: lib/Mail/SpamAssassin/AsyncLoop.pm
===================================================================
--- lib/Mail/SpamAssassin/AsyncLoop.pm	(revision 1790797)
+++ lib/Mail/SpamAssassin/AsyncLoop.pm	(working copy)
@@ -257,6 +257,16 @@
 
 sub bgsend_and_start_lookup {
   my($self, $domain, $type, $class, $ent, $cb, %options) = @_;
+
+  # At this point the $domain should already be encoded to UTF-8 and
+  # IDN converted to ASCII-compatible encoding (ACE).  Make sure this is
+  # really the case in order to be able to catch any leftover omissions.
+  if (utf8::is_utf8($domain)) {
+    warn "bgsend_and_start_lookup: domain name in Unicode, expected octets: $domain\n";
+  } elsif ($domain =~ tr/\x00-\x7F//c) {  # is not all-ASCII
+    info("bgsend_and_start_lookup: non-ASCII domain name: %s", $domain);
+  }
+
   $ent = {}  if !$ent;
   $domain =~ s/\.+\z//s;  # strip trailing dots, these sometimes still sneak in
   $ent->{id} = undef;
Index: lib/Mail/SpamAssassin/Conf.pm
===================================================================
--- lib/Mail/SpamAssassin/Conf.pm	(revision 1790797)
+++ lib/Mail/SpamAssassin/Conf.pm	(working copy)
@@ -82,13 +82,12 @@
 use bytes;
 use re 'taint';
 
-use Mail::SpamAssassin::Util;
 use Mail::SpamAssassin::NetSet;
 use Mail::SpamAssassin::Constants qw(:sa :ip);
 use Mail::SpamAssassin::Conf::Parser;
 use Mail::SpamAssassin::Logger;
 use Mail::SpamAssassin::Util::TieOneStringHash;
-use Mail::SpamAssassin::Util qw(untaint_var);
+use Mail::SpamAssassin::Util qw(untaint_var idn_to_ascii);
 use File::Spec;
 
 use vars qw{
@@ -3473,8 +3472,11 @@
 
 =item util_rb_tld tld1 tld2 ...
 
-This option maintains list of valid TLDs in the RegistryBoundaries code. 
-TLDs include things like com, net, org, etc.
+This option maintains a list of valid TLDs in the RegistryBoundaries code.
+Top-level domains (TLDs) include things like com, net, org, xn--p1ai, рф, ...
+International domain names may be specified in ASCII-compatible encoding (ACE),
+e.g. xn--p1ai, xn--qxam, or with Unicode labels encoded as UTF-8 octets,
+e.g. рф, ελ.
 
 =cut
 
@@ -3537,7 +3539,7 @@
     xn--wgbh1c xn--wgbl6a xn--xhq521b xn--xkc2al3hye2a xn--xkc2dl3a5ee0h
     xn--yfro4i67o xn--ygbi2ammx xn--zfr164b xxx xyz yachts yandex ye yokohama
     youtube yt za zm zone zw
-    /) { $self->{valid_tlds}{lc $_} = 1; }
+    /) { $self->{valid_tlds}{idn_to_ascii($_)} = 1 }
 
   push (@cmds, {
     setting => 'util_rb_tld',
@@ -3551,7 +3553,7 @@
 	return $INVALID_VALUE;
       }
       foreach (split(/\s+/, $value)) {
-        $self->{valid_tlds}{lc $_} = 1;
+        $self->{valid_tlds}{idn_to_ascii($_)} = 1;
       }
       dbg("config: added tld list - $value");
     }
@@ -3560,7 +3562,9 @@
 =item util_rb_2tld 2tld-1.tld 2tld-2.tld ...
 
 This option maintains list of valid 2nd-level TLDs in the RegistryBoundaries
-code.  2TLDs include things like co.uk, fed.us, etc.
+code.  2TLDs include things like co.uk, fed.us, etc.  International domain
+names may be specified in ASCII-compatible encoding (ACE), or with Unicode
+labels encoded as UTF-8 octets.
 
 =cut
 
@@ -3731,7 +3735,7 @@
     net.ye org.ye ac.za alt.za bourse.za city.za co.za edu.za gov.za law.za
     mil.za net.za ngo.za nom.za org.za school.za tm.za web.za ac.zm co.zm
     com.zm edu.zm gov.zm org.zm sch.zm ac.zw co.zw gov.zw org.zw
-    /) { $self->{two_level_domains}{lc $_} = 1; }
+    /) { $self->{two_level_domains}{idn_to_ascii($_)} = 1 }
 
   push (@cmds, {
     setting => 'util_rb_2tld',
@@ -3745,7 +3749,7 @@
 	return $INVALID_VALUE;
       }
       foreach (split(/\s+/, $value)) {
-        $self->{two_level_domains}{lc $_} = 1;
+        $self->{two_level_domains}{idn_to_ascii($_)} = 1;
       }
     }
   });
@@ -3753,7 +3757,9 @@
 =item util_rb_3tld 3tld1.some.tld 3tld2.other.tld ...
 
 This option maintains list of valid 3rd-level TLDs in the RegistryBoundaries
-code.  3TLDs include things like demon.co.uk, plc.co.im, etc.
+code.  3TLDs include things like demon.co.uk, plc.co.im, etc.  International
+domain names may be specified in ASCII-compatible encoding (ACE), or with
+Unicode labels encoded as UTF-8 octets.
 
 =cut
 
@@ -3762,7 +3768,7 @@
   # sa-update 20_aux_tlds.cf.
   foreach (qw/
     demon.co.uk esc.edu.ar lkd.co.im plc.co.im
-    /) { $self->{three_level_domains}{lc $_} = 1; }
+    /) { $self->{three_level_domains}{idn_to_ascii($_)} = 1 }
 
   push (@cmds, {
     setting => 'util_rb_3tld',
@@ -3776,7 +3782,7 @@
 	return $INVALID_VALUE;
       }
       foreach (split(/\s+/, $value)) {
-        $self->{three_level_domains}{lc $_} = 1;
+        $self->{three_level_domains}{idn_to_ascii($_)} = 1;
       }
     }
   });
Index: lib/Mail/SpamAssassin/Constants.pm
===================================================================
--- lib/Mail/SpamAssassin/Constants.pm	(revision 1790797)
+++ lib/Mail/SpamAssassin/Constants.pm	(working copy)
@@ -46,6 +46,7 @@
 	MAX_BODY_LINE_LENGTH MAX_HEADER_KEY_LENGTH MAX_HEADER_VALUE_LENGTH
 	MAX_HEADER_LENGTH ARITH_EXPRESSION_LEXER AI_TIME_UNKNOWN
 	CHARSETS_LIKELY_TO_FP_AS_CAPS MAX_URI_LENGTH
+        $WHITESPACE_UTF8_RE $ALT_FULLSTOP_UTF8_RE
   );
 
   %EXPORT_TAGS = (
@@ -391,6 +392,107 @@
         [\?:]                                   # ? : Operator
       )/ox;
 
+
+# Precompiled regexps matching the UTF-8 octets (not characters) of a single
+# Unicode whitespace character, or of a single alternative full stop used
+# as a domain label separator; both are built once, at compile time.
+our($WHITESPACE_UTF8_RE, $ALT_FULLSTOP_UTF8_RE);
+BEGIN {
+  # http://en.wikipedia.org/wiki/Whitespace_character
+  # Unicode property \p{Space} plus a 'ZERO WIDTH SPACE' U+200B
+  my $wsp_chars = "\x{0009}\x{000A}\x{000B}\x{000C}\x{000D}\x{0020}\x{0085}".
+                  "\x{00A0}\x{1680}\x{2000}\x{2001}\x{2002}\x{2003}\x{2004}".
+                  "\x{2005}\x{2006}\x{2007}\x{2008}\x{2009}\x{200A}\x{200B}".
+                  "\x{2028}\x{2029}\x{202F}\x{205F}\x{3000}";
+  # one alternative per character, the pattern is then encoded to UTF-8 octets
+  my $wsp_bytes = join('|', split(//,$wsp_chars));  utf8::encode($wsp_bytes);
+  $WHITESPACE_UTF8_RE = qr/$wsp_bytes/so;
+
+  # Bug 6751:
+  # RFC 3490 (IDNA): Whenever dots are used as label separators, the
+  #   following characters MUST be recognized as dots: U+002E (full stop),
+  #   U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
+  #   U+FF61 (halfwidth ideographic full stop).
+  # RFC 5895: [...] the IDEOGRAPHIC FULL STOP character (U+3002)
+  #   can be mapped to the FULL STOP before label separation occurs.
+  #   [...] Only the IDEOGRAPHIC FULL STOP character (U+3002) is added in
+  #   this mapping because the authors have not fully investigated [...]
+  # Adding also 'SMALL FULL STOP' (U+FE52) as seen in the wild,
+  # and a 'ONE DOT LEADER' (U+2024).
+  #
+  my $dot_chars = "\x{2024}\x{3002}\x{FF0E}\x{FF61}\x{FE52}";  # \x{002E}
+  # same construction as for the whitespace pattern above
+  my $dot_bytes = join('|', split(//,$dot_chars));  utf8::encode($dot_bytes);
+  $ALT_FULLSTOP_UTF8_RE = qr/$dot_bytes/so;
+}
+
+# http://en.wikipedia.org/wiki/Whitespace_character
+# User-defined character property (see perlunicode), usable in a regexp as
+# \p{Mail::SpamAssassin::Constants::InIDNAWhitespace}:
+# Unicode property \p{Space} plus a 'ZERO WIDTH SPACE' U+200B.
+# A built-in property has to be named with a utf8:: prefix in this notation.
+sub InIDNAWhitespace {
+  return <<'END';
++utf8::Space
++200B
+END
+}
+
+# User-defined character property: a FULL STOP (U+002E) and the alternative
+# dot characters which are also matched by $ALT_FULLSTOP_UTF8_RE.
+sub InIDNAFullStop {
+  return <<'END';
+002E
+2024
+3002
+FE52
+FF0E
+FF61
+END
+}
+
+# http://unicode.org/faq/idn.html IDNA2008, perlunicode(1) man page
+# User-defined character property approximating the characters allowed in
+# IDNA2008 labels.  Conjoining Hangul jamo of all three syllable types
+# (L, V, T) are excluded, see RFC 5892 section 2.9 (OldHangulJamo).
+sub InIDNA2008 {
+  return <<'END';
+!utf8::Changes_When_NFKC_Casefolded
+-utf8::c
+-utf8::z
+-utf8::s
+-utf8::p
+-utf8::nl
+-utf8::no
+-utf8::me
+-utf8::HST=L
+-utf8::HST=V
+-utf8::HST=T
+-utf8::block=Combining_Diacritical_Marks_For_Symbols
+-utf8::block=Ancient_Greek_Musical_Notation
+-utf8::block=Musical_Symbols
+-0640
+-07FA
+-302E
+-302F
+-3031 3035
+-303B
++00B7
++0375
++05F3
++05F4
++30FB
++002D
++06FD
++06FE
++0F0B
++3007
++00DF
++03C2
++utf8::JoinControl
+END
+}
+
 # ArchiveIterator
 
 # if AI doesn't read in the message in the first pass to see if the received
Index: lib/Mail/SpamAssassin/Dns.pm
===================================================================
--- lib/Mail/SpamAssassin/Dns.pm	(revision 1790797)
+++ lib/Mail/SpamAssassin/Dns.pm	(working copy)
@@ -29,7 +29,7 @@
 use Mail::SpamAssassin::PerMsgStatus;
 use Mail::SpamAssassin::AsyncLoop;
 use Mail::SpamAssassin::Constants qw(:ip);
-use Mail::SpamAssassin::Util qw(untaint_var am_running_on_windows);
+use Mail::SpamAssassin::Util qw(untaint_var am_running_on_windows idn_to_ascii);
 
 use File::Spec;
 use IO::Socket;
@@ -101,6 +101,7 @@
 sub do_rbl_lookup {
   my ($self, $rule, $set, $type, $host, $subtest) = @_;
 
+  $host = idn_to_ascii($host);
   $host =~ s/\.\z//s;  # strip a redundant trailing dot
   my $key = "dns:$type:$host";
   my $existing_ent = $self->{async}->get_lookup($key);
@@ -145,6 +146,7 @@
 sub do_dns_lookup {
   my ($self, $rule, $type, $host) = @_;
 
+  $host = idn_to_ascii($host);
   $host =~ s/\.\z//s;  # strip a redundant trailing dot
   my $key = "dns:$type:$host";
 
Index: lib/Mail/SpamAssassin/DnsResolver.pm
===================================================================
--- lib/Mail/SpamAssassin/DnsResolver.pm	(revision 1790797)
+++ lib/Mail/SpamAssassin/DnsResolver.pm	(working copy)
@@ -45,7 +45,7 @@
 use Mail::SpamAssassin;
 use Mail::SpamAssassin::Logger;
 use Mail::SpamAssassin::Constants qw(:ip);
-use Mail::SpamAssassin::Util qw(untaint_var decode_dns_question_entry);
+use Mail::SpamAssassin::Util qw(untaint_var decode_dns_question_entry idn_to_ascii);
 
 use Socket;
 use Errno qw(EADDRINUSE EACCES);
@@ -920,8 +920,9 @@
   # using some arbitrary encoding (they are normally just 7-bit ascii
   # characters anyway, just need to get rid of the utf8 flag).  Bug 6959
   # Most if not all af these come from a SPF plugin.
+  #   (was a call to utf8::encode($name), now we prefer a proper idn_to_ascii)
   #
-  utf8::encode($name);
+  $name = idn_to_ascii($name);
 
   my $retrans = $self->{retrans};
   my $retries = $self->{retry};
Index: lib/Mail/SpamAssassin/PerMsgStatus.pm
===================================================================
--- lib/Mail/SpamAssassin/PerMsgStatus.pm	(revision 1790797)
+++ lib/Mail/SpamAssassin/PerMsgStatus.pm	(working copy)
@@ -59,7 +59,7 @@
 use Mail::SpamAssassin::Constants qw(:sa);
 use Mail::SpamAssassin::AsyncLoop;
 use Mail::SpamAssassin::Conf;
-use Mail::SpamAssassin::Util qw(untaint_var uri_list_canonicalize);
+use Mail::SpamAssassin::Util qw(untaint_var uri_list_canonicalize idn_to_ascii);
 use Mail::SpamAssassin::Timeout;
 use Mail::SpamAssassin::Logger;
 
@@ -1716,16 +1716,18 @@
     $self->{$item} = $self->{msg}->{metadata}->{$item};
   }
 
-  # TODO: International domain names (UTF-8) must be converted to
-  # ASCII-compatible encoding (ACE) for the purpose of setting the
-  # SENDERDOMAIN and AUTHORDOMAIN tags (and probably for other uses too).
-  # (explicitly required for DMARC, draft-kucherawy-dmarc-base sect. 5.6.1)
+  # International domain names (UTF-8) must be converted to ASCII-compatible
+  # encoding (ACE) for the purpose of setting the SENDERDOMAIN and AUTHORDOMAIN
+  # tags (explicitly required for DMARC, RFC 7489)
   #
   { local $1;
     my $addr = $self->get('EnvelopeFrom:addr', undef);
     # collect a FQDN, ignoring potential trailing WSP
     if (defined $addr && $addr =~ /\@([^@. \t]+\.[^@ \t]+?)[ \t]*\z/s) {
-      $self->set_tag('SENDERDOMAIN', lc $1);
+      my $d = idn_to_ascii($1);
+      $self->set_tag('SENDERDOMAIN', $d);
+      $self->{msg}->put_metadata("X-SenderDomain", $d);
+      dbg("metadata: X-SenderDomain: %s", $d);
     }
     # TODO: the get ':addr' only returns the first address; this should be
     # augmented to be able to return all addresses in a header field, multiple
@@ -1732,7 +1734,10 @@
     # addresses in a From header field are allowed according to RFC 5322
     $addr = $self->get('From:addr', undef);
     if (defined $addr && $addr =~ /\@([^@. \t]+\.[^@ \t]+?)[ \t]*\z/s) {
-      $self->set_tag('AUTHORDOMAIN', lc $1);
+      my $d = idn_to_ascii($1);
+      $self->set_tag('AUTHORDOMAIN', $d);
+      $self->{msg}->put_metadata("X-AuthorDomain", $d);
+      dbg("metadata: X-AuthorDomain: %s", $d);
     }
   }
 
@@ -2131,14 +2136,14 @@
 
   # knownscheme regexp looks for either a https?: or ftp: scheme, or www\d*\. or ftp\. prefix, i.e., likely to start a URL
   # schemeless regexp looks for a valid TLD at the end of what may be a FQDN, followed by optional ., optional :portnum, optional /rest_of_uri
-  my $urischemeless = qr/[a-z\d][a-z\d._-]{0,251}\.${tldsRE}\.?(?::\d{1,5})?(?:\/[^$tbirdenddelim]{1,251})?/io;
-  my $uriknownscheme = qr/(?:(?:(?:(?:https?)|(?:ftp)):(?:\/\/)?)|(?:(?:www\d{0,2}|ftp)\.))[^$tbirdenddelim]{1,251}/io;
-  my $urimailscheme = qr/(?:mailto:)?[^$tbirdenddelimplusat]{1,251}@[^$tbirdenddelimemail]{1,251}/io;
+  my $urischemeless = qr/[a-z\d][a-z\d._-]{0,251}\.${tldsRE}\.?(?::\d{1,5})?(?:\/[^$tbirdenddelim]{1,251}?)?/io;
+  my $uriknownscheme = qr/(?:(?:(?:(?:https?)|(?:ftp)):(?:\/\/)?)|(?:(?:www\d{0,2}|ftp)\.))[^$tbirdenddelim]{1,251}?/io;
+  my $urimailscheme = qr/(?:mailto:)?[^$tbirdenddelimplusat]{1,251}@[^$tbirdenddelimemail]{1,251}?/io;
 
   $self->{tbirdurire} = qr/(?:\b|(?<=$iso2022shift)|(?<=[$tbirdstartdelim]))
-                        (?:(?:($uriknownscheme)(?=(?:[$tbirdenddelim]|\z))) |
-                        (?:($urimailscheme)(?=(?:[$tbirdenddelimemail]|\z))) |
-                        (?:\b($urischemeless)(?=(?:[$tbirdenddelim]|\z))))/xo;
+                        (?:(?:($uriknownscheme)(?=(?:[$tbirdenddelim]|$WHITESPACE_UTF8_RE|\z))) |
+                        (?:($urimailscheme)(?=(?:[$tbirdenddelimemail]|$WHITESPACE_UTF8_RE|\z))) |
+                        (?:\b($urischemeless)(?=(?:[$tbirdenddelim]|$WHITESPACE_UTF8_RE|\z))))/xo;
 
   return $self->{tbirdurire};
 }
Index: lib/Mail/SpamAssassin/Plugin/AskDNS.pm
===================================================================
--- lib/Mail/SpamAssassin/Plugin/AskDNS.pm	(revision 1790797)
+++ lib/Mail/SpamAssassin/Plugin/AskDNS.pm	(working copy)
@@ -189,7 +189,7 @@
 use re 'taint';
 
 use Mail::SpamAssassin::Plugin;
-use Mail::SpamAssassin::Util qw(decode_dns_question_entry);
+use Mail::SpamAssassin::Util qw(decode_dns_question_entry idn_to_ascii);
 use Mail::SpamAssassin::Logger;
 
 use vars qw(@ISA %rcode_value $txtdata_can_provide_a_list);
@@ -465,6 +465,7 @@
       $query_domain =~ s{_([A-Z][A-Z0-9]*)_}
                         { defined $current_tag_val{$1} ? $current_tag_val{$1}
                                                        : '' }ge;
+      $query_domain = idn_to_ascii($query_domain);
 
       # the $dnskey identifies this query in AsyncLoop's pending_lookups
       my $dnskey = join(':', 'askdns', $query_type, $query_domain);
Index: lib/Mail/SpamAssassin/Plugin/DKIM.pm
===================================================================
--- lib/Mail/SpamAssassin/Plugin/DKIM.pm	(revision 1790797)
+++ lib/Mail/SpamAssassin/Plugin/DKIM.pm	(working copy)
@@ -122,6 +122,7 @@
 use Mail::SpamAssassin::Plugin;
 use Mail::SpamAssassin::Logger;
 use Mail::SpamAssassin::Timeout;
+use Mail::SpamAssassin::Util qw(idn_to_ascii);
 
 use strict;
 use warnings;
@@ -1048,12 +1049,13 @@
           my $err = $timer->run_and_catch(sub {
             eval {
               if (Mail::DKIM::AuthorDomainPolicy->UNIVERSAL::can("fetch")) {
-                dbg("dkim: adsp: performing lookup on _adsp._domainkey.%s",
-                    $author_domain);
-                # get our Net::DNS::Resolver object
-                my $res = $self->{main}->{resolver}->get_resolver;
-                $practices = Mail::DKIM::AuthorDomainPolicy->fetch(
-                               Protocol => "dns", Domain => $author_domain,
+                my $author_domain_ace = idn_to_ascii($author_domain);
+                 dbg("dkim: adsp: performing lookup on _adsp._domainkey.%s",
+                    $author_domain_ace);
+                 # get our Net::DNS::Resolver object
+                 my $res = $self->{main}->{resolver}->get_resolver;
+                 $practices = Mail::DKIM::AuthorDomainPolicy->fetch(
+                               Protocol => "dns", Domain => $author_domain_ace,
                                DnsResolver => $res);
               }
               1;
Index: lib/Mail/SpamAssassin/Plugin/HeaderEval.pm
===================================================================
--- lib/Mail/SpamAssassin/Plugin/HeaderEval.pm	(revision 1790797)
+++ lib/Mail/SpamAssassin/Plugin/HeaderEval.pm	(working copy)
@@ -25,6 +25,8 @@
 
 use Mail::SpamAssassin::Plugin;
 use Mail::SpamAssassin::Locales;
+use Mail::SpamAssassin::Util qw(get_my_locales parse_rfc822_date
+                                idn_to_ascii is_valid_utf_8);
 use Mail::SpamAssassin::Logger;
 use Mail::SpamAssassin::Constants qw(:sa :ip);
 
@@ -276,6 +278,17 @@
     $str =~ s/^(?:Subject|From):.*$//gmi;
   }
 
+  if ($str =~ tr/\x00-\x7F//c && is_valid_utf_8($str)) {
+    # is non-ASCII and is valid UTF-8
+    if ($str =~ tr/\x00-\x08\x0B\x0C\x0E-\x1F//) {
+      dbg("eval: %s is valid UTF-8 but contains controls: %s", $header, $str);
+    } else {
+      # todo: only with a SMTPUTF8 mail
+      dbg("eval: %s is valid UTF-8: %s", $header, $str);
+      return 0;
+    }
+  }
+
   # count illegal substrings (RFC 2045)
   # (non-ASCII + C0 controls except TAB, NL, CR)
   my $illegal = $str =~ tr/\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff//;
@@ -1035,7 +1048,7 @@
   return 0 if $from eq '' || $to eq '';
   return 0 if $from =~ /^SRS\d=/;
 
-  if ($to =~ /^([^@]+)@(.+)$/) {
+  if ($to =~ /^([^@]+)\@(.+)$/) {
     my($user,$dom) = ($1,$2);
     $dom = $self->{main}->{registryboundaries}->trim_domain($dom);
     return unless
Index: lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm
===================================================================
--- lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm	(revision 1790797)
+++ lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm	(working copy)
@@ -294,7 +294,7 @@
 
 use Mail::SpamAssassin::Plugin;
 use Mail::SpamAssassin::Constants qw(:ip);
-use Mail::SpamAssassin::Util;
+use Mail::SpamAssassin::Util qw(idn_to_ascii);
 use Mail::SpamAssassin::Logger;
 use strict;
 use warnings;
@@ -901,6 +901,7 @@
 sub lookup_domain_ns {
   my ($self, $pms, $obj, $dom, $rulename) = @_;
 
+  $dom = idn_to_ascii($dom);
   my $key = "NS:" . $dom;
   my $ent = {
     key => $key, zone => $dom, obj => $obj, type => "URI-NS",
@@ -986,6 +987,7 @@
 sub lookup_a_record {
   my ($self, $pms, $obj, $hname, $rulename) = @_;
 
+  $hname = idn_to_ascii($hname);
   my $key = "A:" . $hname;
   my $ent = {
     key => $key, zone => $hname, obj => $obj, type => "URI-A",
@@ -1059,6 +1061,7 @@
 sub lookup_single_dnsbl {
   my ($self, $pms, $obj, $rulename, $lookupstr, $dnsbl, $qtype) = @_;
 
+  $dnsbl = idn_to_ascii($dnsbl);
   my $key = "DNSBL:" . $lookupstr . ':' . $dnsbl;
   my $ent = {
     key => $key, zone => $dnsbl, obj => $obj, type => 'URI-DNSBL',
Index: lib/Mail/SpamAssassin/RegistryBoundaries.pm
===================================================================
--- lib/Mail/SpamAssassin/RegistryBoundaries.pm	(revision 1790797)
+++ lib/Mail/SpamAssassin/RegistryBoundaries.pm	(working copy)
@@ -33,6 +33,9 @@
 our @ISA = qw();
 use vars qw(%US_STATES);
 
+use Mail::SpamAssassin::Logger;
+use Mail::SpamAssassin::Util qw(idn_to_ascii);
+
 # called from SpamAssassin->init() to create $self->{util_rb}
 sub new {
   my $class = shift;
@@ -46,7 +49,8 @@
   bless ($self, $class);
 
   # Initialize valid_tlds_re for schemeless uri parsing, FreeMail etc
-  if ($self->{conf}->{valid_tlds}) {
+  if ($self->{conf}->{valid_tlds} && %{$self->{conf}->{valid_tlds}}) {
+    # International domain names are already in ASCII-compatible encoding (ACE)
     my $tlds = join('|', keys %{$self->{conf}->{valid_tlds}});
     # Perl 5.10+ trie optimizes lists, no need for fancy regex optimizing
     $self->{valid_tlds_re} = qr/(?:$tlds)/i;
@@ -87,9 +91,9 @@
 =cut
 
 sub split_domain {
-  my $self = shift;
-  my $domain = lc shift;
+  my ($self, $domain) = @_;
 
+  $domain = idn_to_ascii($domain);
   my $hostname = '';
 
   if (defined $domain && $domain ne '') {
@@ -126,6 +130,7 @@
         }
         else {
           my $temp = join(".", @domparts);
+          # International domain names in ASCII-compatible encoding (ACE)
           last if ($self->{conf}->{three_level_domains}{$temp});
         }
       }
@@ -132,6 +137,7 @@
       elsif (@domparts == 2) {
         # co.uk, etc.
         my $temp = join(".", @domparts);
+        # International domain names in ASCII-compatible encoding (ACE)
         last if ($self->{conf}->{two_level_domains}{$temp});
       }
       push(@hostname, shift @domparts);
@@ -185,12 +191,13 @@
 =cut
 
 sub is_domain_valid {
-  my $self = shift;
-  my $dom = lc shift;
+  my ($self, $dom) = @_;
 
   # domains don't have whitespace
   return 0 if ($dom =~ /\s/);
 
+  $dom = idn_to_ascii($dom);
+
   # ensure it ends in a known-valid TLD, and has at least 1 dot
   return 0 unless ($dom =~ /\.([^.]+)$/);
   return 0 unless ($self->{conf}->{valid_tlds}{$1});
Index: lib/Mail/SpamAssassin/Util.pm
===================================================================
--- lib/Mail/SpamAssassin/Util.pm	(revision 1790797)
+++ lib/Mail/SpamAssassin/Util.pm	(working copy)
@@ -63,10 +63,13 @@
   @EXPORT_OK = qw(&local_tz &base64_decode &untaint_var &untaint_file_path
                   &exit_status_str &proc_status_ok &am_running_on_windows
                   &reverse_ip_address &decode_dns_question_entry
-                  &secure_tmpfile &secure_tmpdir &uri_list_canonicalize);
+                  &secure_tmpfile &secure_tmpdir &uri_list_canonicalize
+                  &get_my_locales &parse_rfc822_date &idn_to_ascii
+                  &is_valid_utf_8);
 }
 
 use Mail::SpamAssassin;
+use Mail::SpamAssassin::Constants qw(:sa);
 
 use Config;
 use IO::Handle;
@@ -75,6 +78,7 @@
 use Time::Local;
 use Sys::Hostname (); # don't import hostname() into this namespace!
 use NetAddr::IP 4.000;
+use Scalar::Util qw(tainted);
 use Fcntl;
 use Errno qw(ENOENT EACCES EEXIST);
 use POSIX qw(:sys_wait_h WIFEXITED WIFSIGNALED WIFSTOPPED WEXITSTATUS
@@ -97,6 +101,46 @@
 
 ###########################################################################
 
+# Encoding object as returned by Encode::find_encoding('UTF-8');
+# it stays undefined if the Encode module cannot be loaded
+our $enc_utf8;
+BEGIN {
+  eval { require Encode }
+    and do { $enc_utf8 = Encode::find_encoding('UTF-8') }
+}
+
+# true if the optional module Net::LibIDN could be loaded
+our $have_libidn;
+BEGIN {
+  eval { require Net::LibIDN } and do { $have_libidn = 1 };
+}
+
+$have_libidn or warn "INFO: module Net::LibIDN not available,\n".
+  "  internationalized domain names with U-labels will not be recognized!\n";
+
+###########################################################################
+
+our $ALT_FULLSTOP_UTF8_RE;
+BEGIN {
+  # Bug 6751:
+  # RFC 3490 (IDNA): Whenever dots are used as label separators, the
+  #   following characters MUST be recognized as dots: U+002E (full stop),
+  #   U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
+  #   U+FF61 (halfwidth ideographic full stop).
+  # RFC 5895: [...] the IDEOGRAPHIC FULL STOP character (U+3002)
+  #   can be mapped to the FULL STOP before label separation occurs.
+  #   [...] Only the IDEOGRAPHIC FULL STOP character (U+3002) is added in
+  #   this mapping because the authors have not fully investigated [...]
+  # Adding also 'SMALL FULL STOP' (U+FE52) as seen in the wild,
+  # and a 'ONE DOT LEADER' (U+2024).
+  #
+  my $dot_chars = "\x{2024}\x{3002}\x{FF0E}\x{FF61}\x{FE52}";  # \x{002E}
+  my $dot_bytes = join('|', split(//,$dot_chars));  utf8::encode($dot_bytes);
+  $ALT_FULLSTOP_UTF8_RE = qr/$dot_bytes/so;
+}
+
+###########################################################################
+
 # find an executable in the current $PATH (or whatever for that platform)
 {
   # Show the PATH we're going to explore only once.
@@ -340,6 +397,96 @@
 
 ###########################################################################
 
+# returns 1 if the provided string of octets represents a syntactically
+# valid UTF-8 string (RFC 3629), 0 otherwise; undef for an undefined argument
+#
+sub is_valid_utf_8($) {
+# my $octets = $_[0];
+  return undef if !defined $_[0];
+  #
+  # RFC 6532: UTF8-non-ascii = UTF8-2 / UTF8-3 / UTF8-4
+  # RFC 3629 section 4: Syntax of UTF-8 Byte Sequences
+  #   UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
+  #   UTF8-1      = %x00-7F
+  #   UTF8-2      = %xC2-DF UTF8-tail
+  #   UTF8-3      = %xE0 %xA0-BF UTF8-tail /
+  #                 %xE1-EC 2( UTF8-tail ) /
+  #                 %xED %x80-9F UTF8-tail /
+  #                   # U+D800..U+DFFF are utf16 surrogates, not legal utf8
+  #                 %xEE-EF 2( UTF8-tail )
+  #   UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) /
+  #                 %xF1-F3 3( UTF8-tail ) /
+  #                 %xF4 %x80-8F 2( UTF8-tail )
+  #   UTF8-tail   = %x80-BF
+  #
+  # loose variant:
+  #   [\x00-\x7F] | [\xC0-\xDF][\x80-\xBF] |
+  #   [\xE0-\xEF][\x80-\xBF]{2} | [\xF0-\xF4][\x80-\xBF]{3}
+  #
+  $_[0] =~ /^ (?: [\x00-\x7F] |
+                  [\xC2-\xDF] [\x80-\xBF] |
+                  \xE0 [\xA0-\xBF] [\x80-\xBF] |
+                  [\xE1-\xEC] [\x80-\xBF]{2} |
+                  \xED [\x80-\x9F] [\x80-\xBF] |
+                  [\xEE-\xEF] [\x80-\xBF]{2} |
+                  \xF0 [\x90-\xBF] [\x80-\xBF]{2} |
+                  [\xF1-\xF3] [\x80-\xBF]{3} |
+                  \xF4 [\x80-\x8F] [\x80-\xBF]{2} )* \z/xs ? 1 : 0;
+}
+
+# Given an international domain name with U-labels (UTF-8 or Unicode chars)
+# converts it to ASCII-compatible encoding (ACE).  If the argument is in
+# ASCII (or is an invalid IDN), returns it lowercased but otherwise unchanged.
+# The result is always in octets (utf8 flag off) even if the argument was in
+# Unicode characters.  An undefined argument yields undef.  Without the
+# Net::LibIDN module a non-ASCII name is only lowercased, not converted.
+# Taintedness of the argument is propagated to the result.
+#
+sub idn_to_ascii($) {
+  no bytes;  # make sure there is no 'use bytes' in effect
+  return undef  if !defined $_[0];
+  my $s = "$_[0]";  # stringify
+  # propagate taintedness of the argument, but not its utf8 flag
+  my $t = tainted($s);  # taintedness of the argument
+  if ($t) {  # untaint $s, avoids taint-related bugs in LibIDN or in old perl
+    no re 'taint';  local $1;  $s =~ /^(.*)\z/s;  $s = $1;
+  }
+  # encode chars to UTF-8, leave octets unchanged (not necessarily valid UTF-8)
+  utf8::encode($s)  if utf8::is_utf8($s);
+  if ($s !~ tr/\x00-\x7F//c) {  # is all-ASCII (including IP address literal)
+    $s = lc $s;
+  } elsif (!is_valid_utf_8($s)) {
+    my($package, $filename, $line) = caller;
+    info("util: idn_to_ascii: not valid UTF-8: /%s/, called from %s line %d",
+         $s, $package, $line);
+    $s = lc $s;  # garbage-in / garbage-out
+  } else {  # is valid UTF-8 but not all-ASCII
+    # RFC 3490 (IDNA): Whenever dots are used as label separators, the
+    # following characters MUST be recognized as dots: U+002E (full stop),
+    # U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
+    # U+FF61 (halfwidth ideographic full stop).
+    if ($s =~ s/$ALT_FULLSTOP_UTF8_RE/./gso) {
+      info("util: idn_to_ascii: alternative dots normalized: /%s/ -> /%s/",
+           $_[0], $s);
+    }
+    if (!$have_libidn) {
+      $s = lc $s;
+    } else {
+      # to ASCII-compatible encoding (ACE), lowercased
+      my $sa = Net::LibIDN::idn_to_ascii($s, 'UTF-8');
+      if (!defined $sa) {
+        info("util: idn_to_ascii: conversion to ACE failed: /%s/", $s);
+      } else {
+        info("util: idn_to_ascii: converted to ACE: /%s/ -> /%s/", $s, $sa);
+        $s = $sa;
+      }
+    }
+  }
+  $t ? taint_var($s) : $s;  # propagate taintedness of the argument
+}
+
+###########################################################################
+
 # map process termination status number to an informative string, and
 # append optional mesage (dual-valued errno or a string or a number),
 # returning the resulting string
@@ -1316,20 +1567,10 @@
       # not required
       $rest ||= '';
 
-      # Bug 6751:
-      # RFC 3490 (IDNA): Whenever dots are used as label separators, the
-      #   following characters MUST be recognized as dots: U+002E (full stop),
-      #   U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
-      #   U+FF61 (halfwidth ideographic full stop).
-      # RFC 5895: [...] the IDEOGRAPHIC FULL STOP character (U+3002)
-      #   can be mapped to the FULL STOP before label separation occurs.
-      #   [...] Only the IDEOGRAPHIC FULL STOP character (U+3002) is added in
-      #   this mapping because the authors have not fully investigated [...]
-      # Adding also 'SMALL FULL STOP' (U+FE52) as seen in the wild.
-      # Parhaps also the 'ONE DOT LEADER' (U+2024).
-      if ($host =~ s{(?: \xE3\x80\x82 | \xEF\xBC\x8E | \xEF\xBD\xA1 |
-                         \xEF\xB9\x92 | \xE2\x80\xA4 )}{.}xgs) {
-        push(@nuris, join ('', $proto, $host, $rest));
+      my $nhost = idn_to_ascii($host);
+      if (defined $nhost && $nhost ne lc $host) {
+        push(@nuris, join('', $proto, $nhost, $rest));
+        $host = $nhost;
       }
 
       # bug 4146: deal with non-US ASCII 7-bit chars in the host portion
@@ -1336,7 +1577,8 @@
       # of the URI according to RFC 1738 that's invalid, and the tested
       # browsers (Firefox, IE) remove them before usage...
       if ($host =~ tr/\000-\040\200-\377//d) {
-        push(@nuris, join ('', $proto, $host, $rest));
+        push(@nuris, join ('', $proto, $host, $rest))
+          if $host =~ /[^\x00-\x2F\x3A-\x40\x5B-\x60\x7B-\x7F\x80]/;
       }
 
       # deal with http redirectors.  strip off one level of redirector
@@ -1381,7 +1623,8 @@
       # the host portion should end in some form of alpha-numeric, strip off
       # the rest.
       if ($host =~ s/[^0-9A-Za-z]+$//) {
-        push(@nuris, join ('', $proto, $host, $rest));
+        push(@nuris, join ('', $proto, $host, $rest))
+          if $host =~ /[^\x00-\x2F\x3A-\x40\x5B-\x60\x7B-\x7F\x80]/;
       }
 
       ########################
Index: lib/Mail/SpamAssassin/Util/DependencyInfo.pm
===================================================================
--- lib/Mail/SpamAssassin/Util/DependencyInfo.pm	(revision 1790797)
+++ lib/Mail/SpamAssassin/Util/DependencyInfo.pm	(working copy)
@@ -132,6 +132,14 @@
   desc => 'Used when manually reporting spam to SpamCop with "spamassassin -r".',
 },
 {
+  'module' => 'Net::LibIDN',
+  'version' => 0,
+  'desc' => "Provides mapping between Internationalized Domain Names (IDN) in
+  Unicode and ASCII-compatible encoding (ACE) for use in DNS and comparisons.
+  The module is optional, but without it Unicode IDN names found in mail will
+  not be suitable for DNS queries and white/blacklisting.",
+},
+{
   module => 'Mail::SPF',
   version => 0,
   desc => 'Used to check DNS Sender Policy Framework (SPF) records to fight email
Index: rules/20_aux_tlds.cf
===================================================================
--- rules/20_aux_tlds.cf	(revision 1790797)
+++ rules/20_aux_tlds.cf	(working copy)
@@ -52,6 +52,8 @@
 #
 # For an up to date list of IDN TLDs that can be pasted into this block, run this command:
 # wget http://data.iana.org/TLD/tlds-alpha-by-domain.txt -O - | tail -n+2 | grep -i 'xn--' | tr '\n' ' ' | fold -w 80 -s | perl -e 's/^/util_rb_tld / && print lc while <>' && echo
+# Since version 4.0 the util_rb_tld also accepts Unicode IDN labels (encoded as UTF-8), e.g.:
+#  wget http://data.iana.org/TLD/tlds-alpha-by-domain.txt -q -O - | grep -i '^xn--' | idn -u | tr '\n' ' ' | fold -w 80 -s | perl -pe 'chomp; s/.*/util_rb_tld \L$_\n/'
 
 if (can(Mail::SpamAssassin::Conf::feature_registryboundaries))
 util_rb_tld xn--1qqw23a xn--30rr7y xn--3bst00m xn--3ds443g xn--3e0b707e xn--45brj9c
Index: t/data/nice/unicode1
===================================================================
--- t/data/nice/unicode1	(nonexistent)
+++ t/data/nice/unicode1	(working copy)
@@ -0,0 +1,28 @@
+Return-Path: <Marilù.Gioffré@esempio-università.it>
+Received: from mail-ig0-x248.esempio-università.it
+  (mail-ig0-x248.esempio-università.it [IPv6:2001:db8::c05:248])
+  (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
+  (No client certificate requested)
+  by Sörensen.example.com (Postfix) with UTF8SMTPS
+  for <Dörte@Sörensen.example.com>; Thu,  8 Oct 2015 07:45:14 +0200 (CEST)
+From: =?ISO-8859-1?Q?Maril=F9?= Gioffré ♥ <Marilù.Gioffré@esempio-università.it>
+To: =?iso-8859-1*sv?Q?D=F6rte_=C5._S=F6rensen,_Jr.?=
+  <Dörte@Sörensen.example.com>
+Cc: θσερ@εχαμπλε.ψομ
+Subject: =?iso-8859-2*sl?Q?Doma=e8e?=
+  =?utf-8*sl?Q?_omre=C5?=     =?Utf-8*SL?q?=BEje?=
+X-Note: The above split of UTF-8 char =C5 =BE is invalid, but seen in the wild
+Date: Mon, 05 Oct 2015 12:00:00 +0200
+Message-ID: <b497e6c2@example.срб>
+MIME-Version: 1.0
+Content-Transfer-Encoding: quoted-printable
+Content-Type: application/octet-stream; name=
+	"=?utf-8?B?0LTQvtC60YPQvNC10L3RgtGLINC00LvRjyDQvtGC0LTQ?=
+	=?utf-8?B?tdC70LAg0LrQsNC00YDQvtCyLnBkZg==?="
+Content-Disposition: attachment; filename=
+	"=?utf-8?B?0LTQvtC60YPQvNC10L3RgtGLINC00LvRjyDQvtGC0LTQ?=
+	=?utf-8?B?tdC70LAg0LrQsNC00YDQvtCyLnBkZg==?="
+X-Note: The above split of multibyte char across encoded-words is also invalid
+
+abc
+def
Index: t/header_utf8.t
===================================================================
--- t/header_utf8.t	(nonexistent)
+++ t/header_utf8.t	(working copy)
@@ -0,0 +1,206 @@
+#!/usr/bin/perl
+
+use lib '.'; use lib 't';
+use SATest; sa_t_init("header_utf8.t");
+
+use constant TEST_ENABLED => ($] >= 5.008);
+
+our $have_libidn;
+BEGIN {
+  eval { require Net::LibIDN } and do { $have_libidn = 1 };
+}
+
+use Test; BEGIN { plan tests => (TEST_ENABLED ? 156 : 0) };
+
+exit unless (TEST_ENABLED);
+
+# ---------------------------------------------------------------------------
+
+%mypatterns = (
+  q{/ LT_RPATH /}     => 'LT_RPATH',
+  q{/ LT_ENVFROM /}   => 'LT_ENVFROM',
+  q{/ LT_FROM /}      => 'LT_FROM',
+  q{/ LT_FROM_ADDR /} => 'LT_FROM_ADDR',
+  q{/ LT_FROM_NAME /} => 'LT_FROM_NAME',
+  q{/ LT_FROM_RAW /}  => 'LT_FROM_RAW',
+  q{/ LT_TO_ADDR /}   => 'LT_TO_ADDR',
+  q{/ LT_TO_NAME /}   => 'LT_TO_NAME',
+  q{/ LT_CC_ADDR /}   => 'LT_CC_ADDR',
+  q{/ LT_SUBJ /}      => 'LT_SUBJ',
+  q{/ LT_SUBJ_RAW /}  => 'LT_SUBJ_RAW',
+  q{/ LT_MESSAGEID /} => 'LT_MESSAGEID',
+  q{/ LT_MSGID /}     => 'LT_MSGID',
+  q{/ LT_CT /}        => 'LT_CT',
+  q{/ LT_CT_RAW /}    => 'LT_CT_RAW',
+  q{/ LT_AUTH_DOM /}  => 'LT_AUTH_DOM',
+  q{/ LT_NOTE /}      => 'LT_NOTE',
+  q{/ LT_UTF8SMTP_ANY /}    => 'LT_UTF8SMTP_ANY',
+  q{/ LT_SPLIT_UTF8_SUBJ /} => 'LT_SPLIT_UTF8_SUBJ',
+  q{/ USER_IN_BLACKLIST /}  => 'USER_IN_BLACKLIST',
+);
+
+%mypatterns_utf8 = (  # as it appears in a report body
+  q{/(?m)^ 0\.0 LT_ANY_CHARS \s*En-tête contient caractères$/} => 'LT_ANY_CHARS utf8',
+);
+
+%mypatterns_mime_qp = (  # as it appears in a mail header section
+  q{/(?m)^\t\*  0\.0 LT_ANY_CHARS =\?UTF-8\?Q\?En-t=C3=AAte_contient_caract=C3=A8res\?=$/} => 'LT_ANY_CHARS mime encoded',
+);
+
+%mypatterns_mime_b64 = (  # as it appears in a mail header section
+  q{/(?m)^\t\*  0\.0 LT_ANY_CHARS =\?UTF-8\?B\?5a2X56ym6KKr5YyF5ZCr5Zyo5raI5oGv5oql5aS06YOo5YiG\?=$/} => 'LT_ANY_CHARS mime encoded',
+);
+
+%mypatterns_mime_b64_bug7307 = (
+  q{/ LT_SUBJ2 /}      => 'LT_SUBJ2',
+  q{/ LT_SUBJ2_RAW /}  => 'LT_SUBJ2_RAW',
+);
+
+%anti_patterns = (
+  q{/ NO_RELAYS /}  => 'NO_RELAYS',
+# q{/ INVALID_MSGID /}  => 'INVALID_MSGID',
+);
+
+my $myrules = <<'END';
+  add_header all  AuthorDomain _AUTHORDOMAIN_
+  blacklist_from  Marilù.Gioffré@esempio-università.it
+  header LT_UTF8SMTP_ANY  Received =~ /\bwith\s*UTF8SMTPS?A?\b/mi
+  score  LT_UTF8SMTP_ANY  -0.1
+  header LT_RPATH   Return-Path:addr =~ /^Marilù\.Gioffré\@esempio-università\.it\z/
+  score  LT_RPATH     0.01
+  header LT_ENVFROM EnvelopeFrom =~ /^Marilù\.Gioffré\@esempio-università\.it\z/
+  score  LT_ENVFROM   0.01
+  header LT_FROM      From =~ /^Marilù Gioffré ♥ <Marilù\.Gioffré\@esempio-università\.it>$/m
+  score  LT_FROM      0.01
+  header LT_FROM_ADDR From:addr =~ /^Marilù\.Gioffré\@esempio-università\.it\z/
+  score  LT_FROM_ADDR 0.01
+  header LT_FROM_NAME From:name =~ /^Marilù Gioffré ♥\z/
+  score  LT_FROM_NAME 0.01
+  header LT_FROM_RAW  From:raw  =~ /^\s*=\?ISO-8859-1\?Q\?Maril=F9\?= Gioffré ♥ <Marilù\.Gioffré\@esempio-università\.it>$/m
+  score  LT_FROM_RAW  0.01
+  header LT_AUTH_DOM  X-AuthorDomain =~ /^xn--esempio-universit-4ob\.it\z/
+  score  LT_AUTH_DOM  0.01
+  header LT_TO_ADDR   To:addr =~ /^Dörte\@Sörensen\.example\.com\z/
+  score  LT_TO_ADDR   0.01
+  header LT_TO_NAME   To:name =~ /^Dörte Å\. Sörensen, Jr\./
+  score  LT_TO_NAME   0.01
+  header LT_CC_ADDR   Cc:addr =~ /^θσερ\@εχαμπλε\.ψομ\z/
+  score  LT_CC_ADDR   0.01
+  header LT_SUBJ      Subject =~ /^Domače omrežje$/m
+  score  LT_SUBJ      0.01
+  header LT_SUBJ_RAW  Subject:raw  =~ /^\s*=\?iso-8859-2\*sl\?Q\?Doma=e8e\?=\s+=\?utf-8\*sl\?Q\?_omre=C5\?=/m
+  score  LT_SUBJ_RAW  0.01
+  header LT_SUBJ2     Subject =~ /^【重要訊息】台電105年3月電費，委託金融機構扣繳成功電子繳費憑證\(電號07487616730\)$/m
+  score  LT_SUBJ2     0.01
+  header LT_SUBJ2_RAW Subject:raw  =~ /^\s*=\?UTF-8\?B\?44CQ6YeN6KaB6KiK5oGv44CR5Y\+w6Zu7MTA15bm0\?=\s*=\?UTF-8\?B\?M\+aciOmbu\+iyu\+\+8jOWnlOiol\+mHkeiejeapn\+ani\+aJow==\?=\s*=\?UTF-8\?B\?57mz5oiQ5Yqf6Zu75a2Q57mz6LK75oaR6K2JKOmbu\+iZnw==\?=\s*=\?UTF-8\?B\?MDc0ODc2MTY3MzAp\?=$/m
+  score  LT_SUBJ2_RAW 0.01
+  header LT_MSGID     Message-ID =~ /^<b497e6c2\@example\.срб>$/m
+  score  LT_MSGID     0.01
+  header LT_MESSAGEID MESSAGEID  =~ /^<b497e6c2\@example\.срб>$/m
+  score  LT_MESSAGEID 0.01
+  header LT_CT        Content-Type =~ /документы для отдела кадров\.pdf/
+  score  LT_CT        0.01
+  header LT_CT_RAW    Content-Type:raw =~ /=\?utf-8\?B\?tdC70LAg0LrQsNC00YDQvtCyLnBkZg==\?="/
+  score  LT_CT_RAW    0.01
+  header LT_SPLIT_UTF8_SUBJ Subject:raw =~ m{(=\?UTF-8) (?: \* [^?=<>, \t]* )? (\?Q\?) [^ ?]* =[89A-F][0-9A-F] \?= \s* \1 (?: \* [^ ?=]* )? \2 =[89AB][0-9A-F]}xsmi
+  score  LT_SPLIT_UTF8_SUBJ 0.01
+  header LT_NOTE      X-Note =~ /^The above.*char =C5 =BE is invalid, .*wild$/m
+  score  LT_NOTE      0.01
+  header LT_ANY_CHARS From =~ /./
+  score  LT_ANY_CHARS 0.001
+  describe         LT_ANY_CHARS  Header contains characters
+  lang fr describe LT_ANY_CHARS  En-tête contient caractères
+  # sorry, Google translate:
+  lang zh describe LT_ANY_CHARS  字符被包含在消息报头部分
+END
+
+if (!$have_libidn) {
+  # temporary fudge to keep this test from failing when Net::LibIDN is
+  # not installed, until Net::LibIDN becomes a mandatory module
+  $myrules =~ s{^(\s*header LT_AUTH_DOM\s+X-AuthorDomain =~)\s*(/.*/)$}
+               {$1 /esempio-università\.it/}m
+}
+
+$ENV{PERL_BADLANG} = 0;  # suppresses Perl warning about failed locale setting
+# see Mail::SpamAssassin::Conf::Parser::parse(), also Bug 6992
+$ENV{LANGUAGE} = $ENV{LANG} = 'fr_CH.UTF-8';
+
+#--- normalize_charset 1
+
+tstlocalrules ($myrules . '
+  report_safe 0
+  normalize_charset 1
+');
+
+%patterns = (%mypatterns, %mypatterns_mime_qp);
+sarun ("-L < data/nice/unicode1", \&patterns_run_cb);
+ok_all_patterns();
+
+tstlocalrules ($myrules . '
+  report_safe 1
+  normalize_charset 1
+');
+%patterns = (%mypatterns, %mypatterns_utf8);
+sarun ("-L < data/nice/unicode1", \&patterns_run_cb);
+ok_all_patterns();
+
+tstlocalrules ($myrules . '
+  report_safe 2
+  normalize_charset 1
+');
+%patterns = (%mypatterns, %mypatterns_utf8);
+sarun ("-L < data/nice/unicode1", \&patterns_run_cb);
+ok_all_patterns();
+
+#--- normalize_charset 0
+
+tstlocalrules ($myrules . '
+  report_safe 0
+  normalize_charset 0
+');
+%patterns = (%mypatterns, %mypatterns_mime_qp);
+sarun ("-L < data/nice/unicode1", \&patterns_run_cb);
+ok_all_patterns();
+
+tstlocalrules ($myrules . '
+  report_safe 1
+  normalize_charset 0
+');
+%patterns = (%mypatterns, %mypatterns_utf8);
+sarun ("-L < data/nice/unicode1", \&patterns_run_cb);
+ok_all_patterns();
+
+tstlocalrules ($myrules . '
+  report_safe 2
+  normalize_charset 0
+');
+%patterns = (%mypatterns, %mypatterns_utf8);
+sarun ("-L < data/nice/unicode1", \&patterns_run_cb);
+ok_all_patterns();
+
+#--- base64 encoded-words
+
+$ENV{PERL_BADLANG} = 0;  # suppresses Perl warning about failed locale setting
+# see Mail::SpamAssassin::Conf::Parser::parse(), also Bug 6992
+$ENV{LANGUAGE} = $ENV{LANG} = 'zh_CN.UTF-8';
+
+tstlocalrules ($myrules . '
+  report_safe 0
+  normalize_charset 1
+');
+%patterns = (%mypatterns, %mypatterns_mime_b64);
+sarun ("-L < data/nice/unicode1", \&patterns_run_cb);
+ok_all_patterns();
+
+#--- base64 encoded-words - Bug 7307
+
+$ENV{LANGUAGE} = $ENV{LANG} = 'en_US.UTF-8';
+
+tstlocalrules ($myrules . '
+  report_safe 0
+  normalize_charset 1
+');
+%patterns = (%mypatterns_mime_b64_bug7307);
+%anti_patterns = ();
+sarun ("-L < data/nice/unicode2", \&patterns_run_cb);
+ok_all_patterns();