URI blacklist Matcher

Michael Bryant Fri, 31 Dec 2004 11:41:19 -0800

Hello,

I have developed a Matcher that works with URI blacklists (see http://www.surbl.org). This Matcher scans the body of the message for domain names. Domain lookups are then performed against the supplied URI blacklists (SpamCop calls these "spamvertised" websites). If a hit is found on any of the domains, all recipients are returned by the Matcher.

If there is interest, I would like to contribute this code to the James project. There may be issues with the way the code and TLD data are currently organized, the use of java.util.regex, etc. I will be happy to help out with any additional work that might need to be done.

-Mike Bryant.

/***********************************************************************
 * Copyright (c) 2004 Michael Bryant                     .             *
 * All rights reserved.                                                *
 * ------------------------------------------------------------------- *
 * Licensed under the Apache License, Version 2.0 (the "License"); you *
 * may not use this file except in compliance with the License. You    *
 * may obtain a copy of the License at:                                *
 *                                                                     *
 *     http://www.apache.org/licenses/LICENSE-2.0                      *
 *                                                                     *
 * Unless required by applicable law or agreed to in writing, software *
 * distributed under the License is distributed on an "AS IS" BASIS,   *
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or     *
 * implied.  See the License for the specific language governing       *
 * permissions and limitations under the License.                      *
 * ------------------------------------------------------------------- *
 * This software contains code derived from the Apache James Project.  *
 ***********************************************************************/


package net._4mi.james.matchers;

import org.apache.mailet.GenericMatcher;
import org.apache.mailet.Mail;

import net._4mi.james.matchers.util.URIScanner;

import java.util.Collection;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.StringTokenizer;
import java.net.UnknownHostException;
import java.io.IOException;
import javax.mail.MessagingException;
import javax.mail.internet.MimeBodyPart;
import javax.mail.internet.MimeMessage;
import javax.mail.internet.MimeMultipart;
import javax.mail.internet.MimePart;

/**
 * Scans the message body for URIs, then checks them against a set of 
 * URI spammer blacklists. (See http://www.surbl.org.)
 *
 * Example:
 * &lt;mailet match="InURIBlackLists=sc.surbl.org,ab.surbl.org" 
class="ToProcessor"&gt;
 *   &lt;processor&gt;spam&lt;/processor&gt;
 * &lt;/mailet&gt;
 *
 */
public class InURIBlacklists extends GenericMatcher {

    /** Set of URI black lists to use */
    private final ArrayList uribls = new ArrayList();
    
    /**
     * Initializes this Matcher with a list URI blacklist domains
     */
    public void init() throws javax.mail.MessagingException {
        StringTokenizer st = new StringTokenizer(getCondition(), ", \t", false);
        while (st.hasMoreTokens()) {
            uribls.add(st.nextToken());
        }  
        log("uribls="+uribls);
    }
    
    /**
     * 
     * Scans the message body for URIs, then checks them against a set of 
     * URI blacklists. (See http://www.surbl.org.) If any hits are found, all
     * recipients are returned.
     *
     * @param mail the Mail object which contains a MimeMessage and routing
     *        information
     * @return a Collection of recipients. If any URI hits are found, returns
     *         all of the mail's recipients, returns null otherwise.
     * @throws javax.mail.MessagingException - if an exception occurs that
     *         interferes with the mailet's normal operation occurred
     */
    public Collection match(Mail mail)
    throws MessagingException {
        MimeMessage message = mail.getMessage();
        log("doing URIBL lookup on mail w/ subject: 
\""+message.getSubject()+"\"");
        HashSet domains = new HashSet(20);
        try {
            scanMailForDomains(domains, message);
        }
        catch (IOException ioe) {
            throw new MessagingException("Could not read MimeMessage", ioe);
        }
        //log(stack2string(new Exception("foo")));
        log("found domains: "+domains);
        for (Iterator i=domains.iterator(); i.hasNext();) {
            String domain = (String)i.next();
            log("looking up: \""+domain+"\"");
            for (Iterator j=uribls.iterator(); j.hasNext();) {
                String uribl = (String)j.next();
                log("using uribl: \""+uribl+"\"");
                String target = domain + "." + uribl;
                log("target: \""+target+"\"");
                try {
                    org.apache.james.dnsserver.DNSServer.getByName(target);
                    log("got a hit: \""+target+"\"");
                    return mail.getRecipients();
                } 
                catch (UnknownHostException uhe) {
                    // domain not found. keep processing
                }
            }
        }
        log("no spammy URIs");
        return null;
    }
    
    /**
     * Recursively scans all MimeParts of an email for domain strings. Domain
     * strings that are found are added to the supplied HashSet.
     *
     * @param domains HashSet for accumulating domain strings
     * @param part MimePart to scan
     */
    protected void scanMailForDomains(HashSet domains, MimePart part)
    throws MessagingException, IOException {
        log("    mime type is: \""+part.getContentType()+"\"");
        if (part.isMimeType("text/plain") || part.isMimeType("text/html")) {
            log("    scanning: \""+part.getContent().toString()+"\"");
            URIScanner.scanContentForDomains(domains, 
part.getContent().toString());
        }
        else if (part.isMimeType("multipart/*")) {
            MimeMultipart multipart = (MimeMultipart)part.getContent();
            int count = multipart.getCount();
            log("    multipart count is: "+count);
            for (int index=0; index<count; index++) {
                log("    recursing index: "+index);
                MimeBodyPart mimeBodyPart = 
(MimeBodyPart)multipart.getBodyPart(index);
                scanMailForDomains(domains, mimeBodyPart);
            }
        }
    }
    
    /**  
    static protected String stack2string(Exception e) {
        try {
            java.io.StringWriter sw = new java.io.StringWriter();
            java.io.PrintWriter pw = new java.io.PrintWriter(sw);
            e.printStackTrace(pw);
            return "------" + sw.toString() + "------";
        }
        catch(Exception e2) {
            return "bad stack2string";
        }
    }
    **/
}

/***********************************************************************
 * Copyright (c) 2004 Michael Bryant                     .             *
 * All rights reserved.                                                *
 * ------------------------------------------------------------------- *
 * Licensed under the Apache License, Version 2.0 (the "License"); you *
 * may not use this file except in compliance with the License. You    *
 * may obtain a copy of the License at:                                *
 *                                                                     *
 *     http://www.apache.org/licenses/LICENSE-2.0                      *
 *                                                                     *
 * Unless required by applicable law or agreed to in writing, software *
 * distributed under the License is distributed on an "AS IS" BASIS,   *
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or     *
 * implied.  See the License for the specific language governing       *
 * permissions and limitations under the License.                      *
 * ------------------------------------------------------------------- *
 * This software contains regular expression code derived from the     *
 * Apache SpamAssassin Project.                                        *
 ***********************************************************************/

package net._4mi.james.matchers.util;

import java.util.HashSet;
import java.util.Iterator;
import java.util.StringTokenizer;
import java.util.regex.*;
import java.net.URI;
import java.net.UnknownHostException;
import java.io.IOException;

public class URIScanner {
    /* These regular expressions "inspired" by Spamassassin */
    static private final String reserved = ";/?:@&=+$,[]\\#|";
    static private final String reservedNoColon = ";/?@&=+$,[]\\#|";
    static private final String mark = "-_.!~*'()";
    static private final String unreserved = "A-Za-z0-9" + escape(mark) + 
"\\x00-\\x08\\x0b\\x0c\\x0e-\\x1f";
    static private final String uricSet = escape(reserved) + unreserved + "%";
    static private final String uricNoColon = escape(reservedNoColon) + 
unreserved + "%";    
    static private final String schemeRE = 
"(?-xism:(?:https?|ftp|mailto|javascript|file))";
    static private final String schemelessRE = 
"(?-xism:(?<![.=])(?:(?i)www\\d*\\.|(?i)ftp\\.))";
    static private final String uriRE = 
"(?-xism:\\b(?:"+schemeRE+":["+uricNoColon+"]|"+schemelessRE+")["+uricSet+"#]*)";
    
    /** Pre-compiled pattern that matches URIs */
    static private final Pattern uriPattern = Pattern.compile(uriRE);
    
    /** Pre-compiled pattern that matches URI scheme strings */
    static private final Pattern schemePattern = 
Pattern.compile("^"+schemeRE+":");
    
    /** Pre-compiled pattern used to cleanup a found URI string */
    static private final Pattern uriCleanup = Pattern.compile("^<(.*)>$");
    
    /** Pre-compiled pattern used to cleanup a found URI string */
    static private final Pattern uriCleanup2 = Pattern.compile("[\\]\\)>#]$");
    
    /** Pre-compile pattern for identifying "mailto" patterns */
    static private final Pattern uriCleanup3 = 
Pattern.compile("^(?i)mailto:([^\\/]{2})(.*)$");
    
    /* These regular expressions also "inspired" by Spamassassin */
    static private final String esc = "\\\\";
    static private final String period = "\\.";
    static private final String space = "\\040";
    static private final String open_br = "\\[";
    static private final String close_br = "\\]";
        static private final String nonASCII = "\\x80-\\xff";
        static private final String ctrl = "\\000-\\037";
        static private final String cr_list = "\\n\\015";
        static private final String qtext = "[^"+esc+nonASCII+cr_list+"\"]";
        static private final String dtext = 
"[^"+esc+nonASCII+cr_list+open_br+close_br+"]";
        static private final String quoted_pair = esc+"[^"+nonASCII+"]";
        static private final String atom_char = 
"[^("+space+")<>@,;:\"."+esc+open_br+close_br+ctrl+nonASCII+"]";
        static private final String atom = "(?>"+atom_char+"+)";
        static private final String quoted_str = 
"\""+qtext+"*(?:"+quoted_pair+qtext+"*)*\"";
        static private final String word = "(?:"+atom+"|"+quoted_str+")";
        static private final String local_part = word+"(?:"+period+word+")*";
        static private final String label       = 
"[A-Za-z\\d](?:[A-Za-z\\d-]*[A-Za-z\\d])?";
        static private final String domain_ref  = label+"(?:"+period+label+")*";
        static private final String domain_lit  = 
open_br+"(?:"+dtext+"|"+quoted_pair+")*"+close_br;
        static private final String domain      = 
"(?:"+domain_ref+"|"+domain_lit+")";
        static private final String Addr_spec_re   = 
"(?-xism:"+local_part+"[EMAIL PROTECTED]"+domain+")";
    
    /** Pre-compiled pattern for matching "schemeless" mailto strings */
    static private final Pattern emailAddrPattern = 
Pattern.compile(Addr_spec_re);
    
    /** Simple reqular expression to match an octet part of an IP address */
    static private final String octet = 
"(?:[1-2][0-9][0-9])|(?:[1-9][0-9])|(?:[0-9])";
    
    /** Simple regular expression to match a part of a domain string in the
        TLDLookup cache. */
    static private final String tld = "[A-Za-z0-9\\-]*";
    
    /** Simple regular expression that matches a two-part TLD */
    static private final String tld2 = tld+"\\."+tld;
        
    /** Simple regular expression that matches a three-part TLD */
    static private final String tld3 = tld+"\\."+tld+"\\."+tld;
    
    /** Regular expression that matches and captures parts of a possible 
        one-part TLD domain string */
    static private final String tldCap = "("+tld+"\\.("+tld+"))$";
    
    /** Regular expression that matches and captures parts of a possible 
        two-part TLD domain string */
    static private final String tld2Cap = "("+tld+"\\.("+tld2+"))$";
    
    /** Regular expression that matches and captures parts of a possible 
        three-part TLD domain string */
    static private final String tld3Cap = "("+tld+"\\.("+tld3+"))$";
    
    /** Regular expression that matches and captures parts of an IP address */
    static private final String ipCap = 
"(("+octet+")\\.("+octet+")\\.("+octet+")\\.("+octet+"))$";
    
    /** Pre-compiled pattern that matches IP addresses */
    static private final Pattern ipCapPattern = Pattern.compile(ipCap);
    
    /** Pre-compiled pattern that matches domain string that is possibly
        contained in a one-part TLD */
    static private final Pattern tldCapPattern = Pattern.compile(tldCap);

    /** Pre-compiled pattern that matches domain string that is possibly
        contained in a two-part TLD */
    static private final Pattern tld2CapPattern = Pattern.compile(tld2Cap);

    /** Pre-compiled pattern that matches domain string that is possibly
        contained in a three-part TLD */
    static private final Pattern tld3CapPattern = Pattern.compile(tld3Cap);
    
    /** controls testing/debug output */
    static private boolean testing = false;
    
    /**
     * Scans a character sequence for URIs. Then add all unique domain strings 
     * derived from those found URIs to the supplied HashSet.
     * <p>
     * This function calls scanContentForHosts() to grab all the host strings.
     * Then it calls domainFromHost() on each host string found to distill them
     * to their basic "registrar" domains. 
     *
     * @param domains a HashSet to be populated with all domain strings found in
     *        the content
     * @param content a character sequence to be scanned for URIs
     */
    static public void scanContentForDomains(HashSet domains, CharSequence 
content) {
        HashSet hosts = scanContentForHosts(content);
        for (Iterator i = hosts.iterator(); i.hasNext();) {
            String domain = domainFromHost((String)i.next());
            if (null != domain) {
                if (false == domains.contains(domain)) {
                    domains.add(domain);
                }
            }
        }
    }
    
    /**
     * Scans a character sequence for URIs. Then returns all unique host 
strings 
     * derived from those found URIs in a HashSet
     *
     * @param content a character sequence to be scanned for URIs
     * @return a HashSet containing host strings
     */
    static protected HashSet scanContentForHosts(CharSequence content) {
        HashSet set = new HashSet();
        try {
            // look for URIs
            Matcher mat = uriPattern.matcher(content);
            while (mat.find()) {
                String found = mat.group();
                Matcher cleanMat = uriCleanup.matcher(found);
                if (cleanMat.find()) {
                    found = cleanMat.group(1);
                }
                cleanMat = uriCleanup2.matcher(found);
                if (cleanMat.find()) {
                    found = cleanMat.replaceAll("");
                }
                cleanMat = uriCleanup3.matcher(found);
                if (cleanMat.find()) {
                    found = "mailto://"+cleanMat.group(1)+cleanMat.group(2);
                }
                cleanMat = schemePattern.matcher(found);
                if (!cleanMat.find()) {
                    if (found.matches("^(?i)www\\d*\\..*")) {
                        found = "http://"; + found;
                    }
                    else if (found.matches("^(?i)ftp\\..*")) {
                        found = "ftp://"; + found;
                    }
                }
                String host = hostFromUriStr(found);
                if (null != host) {
                    host = host.toLowerCase();
                    if (false == set.contains(host)) {
                        set.add(host);
                    }
                }
            }
            
            // look for "schemeless" email addresses, too
            mat = emailAddrPattern.matcher(content);
            while (mat.find()) {
                String found = mat.group();
                debugOut("******** mailfound=\""+found+"\"");
                found = "mailto://"+found;
                debugOut("*******6 mailfoundfound=\""+found+"\" after cleanup 
6");
                String host = hostFromUriStr(found);
                if (null != host) {
                    host = host.toLowerCase();
                    if (false == set.contains(host)) {
                        set.add(host);
                    }
                }
            }
        }
        catch (Exception ex) {
            debugOut(ex.toString());
            ex.printStackTrace();
        }
        return set;
    }
    
    /**
     * Extracts and returns the host portion of URI string.
     *
     * This function uses java.net.URI.
     *
     * @param uriStr a string containing a URI
     * @return the host portion of the supplied URI, null if no host string
     *         could be found
     */
    static protected String hostFromUriStr(String uriStr) {
        debugOut("hostFromUriStr(\""+uriStr+"\")");
        String host = null;
        try {
            URI uri = new URI(uriStr);
            host = uri.getHost();
        }
        catch (Exception ex) {
        }
        return host;
    }
    
    /**
     * Extracts and returns the registrar domain portion of a host string. This
     * funtion checks all known multi-part TLDs to make sure that registrar
     * domain is complete. For example, if the supplied host string is
     * "subdomain.example.co.uk", the TLD is "co.uk" and not "uk". Therefore,
     * the correct registrar domain is not "co.uk", but "example.co.uk". If the
     * domain string is an IP address, then the octets are returned in reverse
     * order.
     *
     * @param host a string containing a host name
     * @return the registrar domain portion of the supplied host string
     */
    static protected String domainFromHost(String host) {
        debugOut("domainFromHost(\""+host+"\")");
        String domain = null;
        Matcher mat;
        try {
            
            // IP addrs 
            mat = ipCapPattern.matcher(host);
            if (mat.find()) {
                // reverse the octets now
                domain = 
mat.group(5)+"."+mat.group(4)+"."+mat.group(3)+"."+mat.group(2);
                debugOut("domain=\""+domain+"\"");
                return domain;
            }
            
            // 3-part TLDs
            mat = tld3CapPattern.matcher(host);
            if (mat.find()) {
                String tld = mat.group(2);
                if (TLDLookup.isThreePartTLD(tld)) {
                    domain = mat.group(1);
                    debugOut("domain=\""+domain+", tld=\""+tld+"\"");
                    return domain;
                }
            }
            
            // 2-part TLDs
            mat = tld2CapPattern.matcher(host);
            if (mat.find()) {
                String tld = mat.group(2);
                if (TLDLookup.isTwoPartTLD(tld)) {
                    domain = mat.group(1);
                    debugOut("domain=\""+domain+", tld=\""+tld+"\"");
                    return domain;
                }
            }
            
            // 1-part TLDs
            mat = tldCapPattern.matcher(host);
            if (mat.find()) {
                String tld = mat.group(2);
                domain = mat.group(1);
                debugOut("domain=\""+domain+", tld=\""+tld+"\"");
                return domain;
            }
        }
        catch (Exception ex) {
            debugOut(ex.toString());
            ex.printStackTrace();
        }
        return domain;
    }
    
    /**
     * Debugging output
     */
    private static void debugOut(String msg) {
        if (true == testing) {
            System.out.println(msg);
        }
    }
    
    /**
     * Test driver
     */
    public static void main(String args[]) {
        testing = true;

        String str = "jhl http://123.234.12.34/foo.html kh mailto:[EMAIL 
PROTECTED] jlksjl <http://Www.foo.org> hkjhkjhk kljhlkj www3.foobar.org kjhk  
wWw.foojar.org jh  fTp.foot.com lhjhkj h www.foo.org";

        debugOut("str=\""+str+"\"");
        
        HashSet domains = new HashSet();
        scanContentForDomains(domains, str);
        for (Iterator i=domains.iterator(); i.hasNext();) {
            String domain = (String)i.next();
            debugOut("domain = "+domain);
        }
    }
    
    /**
     * A utility function that "escapes" special characters in a string.
     *
     * @param str a string to be processed
     * @return modified "escaped" string
     */
    private static String escape(String str) {
        StringBuffer buffer = new StringBuffer();
        for (int i=0; i<str.length(); i++) {
            char ch = str.charAt(i);
            if (Character.isDigit(ch) || Character.isUpperCase(ch) || 
Character.isLowerCase(ch) || ch == '_') {
                buffer.append(ch);
            }
            else {
                buffer.append("\\");
                buffer.append(ch);
            }
        }
        return buffer.toString();
    }
}

/***********************************************************************
 * Copyright (c) 2004 Michael Bryant                     .             *
 * All rights reserved.                                                *
 * ------------------------------------------------------------------- *
 * Licensed under the Apache License, Version 2.0 (the "License"); you *
 * may not use this file except in compliance with the License. You    *
 * may obtain a copy of the License at:                                *
 *                                                                     *
 *     http://www.apache.org/licenses/LICENSE-2.0                      *
 *                                                                     *
 * Unless required by applicable law or agreed to in writing, software *
 * distributed under the License is distributed on an "AS IS" BASIS,   *
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or     *
 * implied.  See the License for the specific language governing       *
 * permissions and limitations under the License.                      *
 ***********************************************************************/

package net._4mi.james.matchers.util;
import java.util.HashSet;

/**
 * A utility class that caches sets of multi-part top level domains (TLDs) for
 * quick lookup.
 */
public class TLDLookup {
    
    /** Simple regular expression to match strings in the cache. Note: if the
        collection of known multi-part TLDs changes to contain characters other 
        than these, this string must be modified. */
    static private final String tld = "[A-Za-z0-9\\-]*";
    
    /** Simple regular expression that matches a two-part TLD */
    static private final String tld2 = tld+"\\."+tld;
    
    /** Simple regular expression that matches a three-part TLD */
    static private final String tld3 = tld+"\\."+tld+"\\."+tld;
    
    /** Array of all known multi-level TLDs. Declared before the two sets
        below because their initializers iterate over this array. */
    static private final String[] multiPartTLDs = initMultiPartTLDs();
    
    /** A set of all known two-part TLDs (the entries of multiPartTLDs that
        match tld2) */
    static private final HashSet twoPartTLDs = initTwoPartTLDs();
    
    /** A set of all known three-part TLDs (the entries of multiPartTLDs that
        match tld3) */
    static private final HashSet threePartTLDs = initThreePartTLDs();

    /** controls testing/debug output */
    static private boolean testing = false;
    
    /**
     * Tells whether a two-part domain string (xxx.xxx) is one of the cached
     * two-part TLDs. The lookup is an exact string comparison.
     *
     * @param domain a String representing a two-part domain
     * @return true when the cache holds the domain string, false otherwise
     */
    static public boolean isTwoPartTLD(String domain) {
        final boolean known = twoPartTLDs.contains(domain);
        return known;
    }
    
    /**
     * Tells whether a three-part domain string (xxx.xxx.xxx) is one of the
     * cached three-part TLDs. The lookup is an exact string comparison.
     *
     * @param domain a String representing a three-part domain
     * @return true when the cache holds the domain string, false otherwise
     */
    static public boolean isThreePartTLD(String domain) {
        final boolean known = threePartTLDs.contains(domain);
        return known;
    }
    
    /**
     * Initialize two-part top-level domain cache by selecting every entry of
     * the multi-part TLD array that consists of exactly two dot-separated
     * parts.
     *
     * @return a HashSet containing all known two-part TLDs
     */
    static private HashSet initTwoPartTLDs() {
        HashSet set = new HashSet(900);
        // Compile the expression once; String.matches() would recompile it
        // for every entry of the array.
        java.util.regex.Pattern twoPart =
            java.util.regex.Pattern.compile("^"+tld2+"$");
        for (int i=0; i<multiPartTLDs.length; i++) {
            String candidate = multiPartTLDs[i];
            // Skip null entries explicitly instead of catching the exception.
            if (candidate != null && twoPart.matcher(candidate).matches()) {
                set.add(candidate);
            }
        }
        debugOut("initTwoPartTLDs size="+set.size());
        return set;
    }

    /**
     * Initialize three-part top-level domain cache by selecting every entry
     * of the multi-part TLD array that consists of exactly three
     * dot-separated parts.
     *
     * @return a HashSet containing all known three-part TLDs
     */
    static private HashSet initThreePartTLDs() {
        HashSet set = new HashSet();
        // Compile the expression once; String.matches() would recompile it
        // for every entry of the array.
        java.util.regex.Pattern threePart =
            java.util.regex.Pattern.compile("^"+tld3+"$");
        for (int i=0; i<multiPartTLDs.length; i++) {
            String candidate = multiPartTLDs[i];
            // Skip null entries explicitly instead of catching the exception.
            if (candidate != null && threePart.matcher(candidate).matches()) {
                debugOut("adding \"" + candidate + "\"");
                set.add(candidate);
            }
        }
        debugOut("initThreePartTLDs size="+set.size());
        return set;
    }
    
    /**
     * Initialize an array of Strings containing all known multi-part TLDs
     *
     * @return an array of all known multi-part TLDs
     */
    static private String[] initMultiPartTLDs() {
        String[] tmp = new String[] {
            "com.ac",
            "edu.ac",
            "gov.ac",
            "edu.ai",
            "gov.ai",
            "com.ar",
            "net.ar",
            "org.ar",
            "gov.ar",
            "mil.ar",
            "edu.ar",
            "int.ar",
            "co.at",
            "ac.at",
            "or.at",
            "gv.at",
            "priv.at",
            "com.au",
            "gov.au",
            "org.au",
            "edu.au",
            "id.au",
            "oz.au",
            "info.au",
            "net.au",
            "asn.au",
            "csiro.au",
            "telememo.au",
            "conf.au",
            "otc.au",
            "com.az",
            "net.az",
            "org.az",
            "com.bb",
            "net.bb",
            "org.bb",
            "ac.be",
            "belgie.be",
            "dns.be",
            "fgov.be",
            "com.bh",
            "gov.bh",
            "net.bh",
            "edu.bh",
            "org.bh",
            "com.bm",
            "edu.bm",
            "gov.bm",
            "org.bm",
            "net.bm",
            "adm.br",
            "adv.br",
            "agr.br",
            "am.br",
            "arq.br",
            "art.br",
            "ato.br",
            "bio.br",
            "bmd.br",
            "cim.br",
            "cng.br",
            "cnt.br",
            "com.br",
            "coop.br",
            "ecn.br",
            "edu.br",
            "eng.br",
            "esp.br",
            "etc.br",
            "eti.br",
            "far.br",
            "fm.br",
            "fnd.br",
            "fot.br",
            "fst.br",
            "g12.br",
            "ggf.br",
            "gov.br",
            "imb.br",
            "ind.br",
            "inf.br",
            "jor.br",
            "lel.br",
            "mat.br",
            "med.br",
            "mil.br",
            "mus.br",
            "net.br",
            "nom.br",
            "not.br",
            "ntr.br",
            "odo.br",
            "org.br",
            "ppg.br",
            "pro.br",
            "psc.br",
            "psi.br",
            "qsl.br",
            "rec.br",
            "slg.br",
            "srv.br",
            "tmp.br",
            "trd.br",
            "tur.br",
            "tv.br",
            "vet.br",
            "zlg.br",
            "com.bs",
            "net.bs",
            "org.bs",
            "ab.ca",
            "bc.ca",
            "mb.ca",
            "nb.ca",
            "nf.ca",
            "nl.ca",
            "ns.ca",
            "nt.ca",
            "nu.ca",
            "on.ca",
            "pe.ca",
            "qc.ca",
            "sk.ca",
            "yk.ca",
            "co.ck",
            "net.ck",
            "org.ck",
            "edu.ck",
            "gov.ck",
            "com.cn",
            "edu.cn",
            "gov.cn",
            "net.cn",
            "org.cn",
            "ac.cn",
            "ah.cn",
            "bj.cn",
            "cq.cn",
            "gd.cn",
            "gs.cn",
            "gx.cn",
            "gz.cn",
            "hb.cn",
            "he.cn",
            "hi.cn",
            "hk.cn",
            "hl.cn",
            "hn.cn",
            "jl.cn",
            "js.cn",
            "ln.cn",
            "mo.cn",
            "nm.cn",
            "nx.cn",
            "qh.cn",
            "sc.cn",
            "sn.cn",
            "sh.cn",
            "sx.cn",
            "tj.cn",
            "tw.cn",
            "xj.cn",
            "xz.cn",
            "yn.cn",
            "zj.cn",
            "arts.co",
            "com.co",
            "edu.co",
            "firm.co",
            "gov.co",
            "info.co",
            "int.co",
            "nom.co",
            "mil.co",
            "org.co",
            "rec.co",
            "store.co",
            "web.co",
            "ac.cr",
            "co.cr",
            "ed.cr",
            "fi.cr",
            "go.cr",
            "or.cr",
            "sa.cr",
            "com.cu",
            "net.cu",
            "org.cu",
            "ac.cy",
            "com.cy",
            "gov.cy",
            "net.cy",
            "org.cy",
            "co.dk",
            "art.do",
            "com.do",
            "edu.do",
            "gov.do",
            "org.do",
            "mil.do",
            "net.do",
            "web.do",
            "com.dz",
            "org.dz",
            "net.dz",
            "gov.dz",
            "edu.dz",
            "ass.dz",
            "pol.dz",
            "art.dz",
            "com.ec",
            "k12.ec",
            "edu.ec",
            "fin.ec",
            "med.ec",
            "gov.ec",
            "mil.ec",
            "org.ec",
            "net.ec",
            "com.eg",
            "edu.eg",
            "eun.eg",
            "gov.eg",
            "net.eg",
            "org.eg",
            "sci.eg",
            "com.er",
            "net.er",
            "org.er",
            "edu.er",
            "mil.er",
            "gov.er",
            "ind.er",
            "com.et",
            "gov.et",
            "org.et",
            "edu.et",
            "net.et",
            "biz.et",
            "name.et",
            "info.et",
            "ac.fj",
            "com.fj",
            "gov.fj",
            "id.fj",
            "org.fj",
            "school.fj",
            "com.fk",
            "ac.fk",
            "gov.fk",
            "net.fk",
            "nom.fk",
            "org.fk",
            "asso.fr",
            "nom.fr",
            "barreau.fr",
            "com.fr",
            "prd.fr",
            "presse.fr",
            "tm.fr",
            "aeroport.fr",
            "assedic.fr",
            "avocat.fr",
            "avoues.fr",
            "cci.fr",
            "chambagri.fr",
            "chirurgiens-dentistes.fr",
            "experts-comptables.fr",
            "geometre-expert.fr",
            "gouv.fr",
            "greta.fr",
            "huissier-justice.fr",
            "medecin.fr",
            "notaires.fr",
            "pharmacien.fr",
            "port.fr",
            "veterinaire.fr",
            "com.ge",
            "edu.ge",
            "gov.ge",
            "mil.ge",
            "net.ge",
            "org.ge",
            "pvt.ge",
            "co.gg",
            "org.gg",
            "sch.gg",
            "ac.gg",
            "gov.gg",
            "ltd.gg",
            "ind.gg",
            "net.gg",
            "alderney.gg",
            "guernsey.gg",
            "sark.gg",
            "com.gu",
            "edu.gu",
            "net.gu",
            "org.gu",
            "gov.gu",
            "mil.gu",
            "com.hk",
            "net.hk",
            "org.hk",
            "idv.hk",
            "gov.hk",
            "edu.hk",
            "co.hu",
            "2000.hu",
            "erotika.hu",
            "jogasz.hu",
            "sex.hu",
            "video.hu",
            "info.hu",
            "agrar.hu",
            "film.hu",
            "konyvelo.hu",
            "shop.hu",
            "org.hu",
            "bolt.hu",
            "forum.hu",
            "lakas.hu",
            "suli.hu",
            "priv.hu",
            "casino.hu",
            "games.hu",
            "media.hu",
            "szex.hu",
            "sport.hu",
            "city.hu",
            "hotel.hu",
            "news.hu",
            "tozsde.hu",
            "tm.hu",
            "erotica.hu",
            "ingatlan.hu",
            "reklam.hu",
            "utazas.hu",
            "ac.id",
            "co.id",
            "go.id",
            "mil.id",
            "net.id",
            "or.id",
            "co.il",
            "net.il",
            "org.il",
            "ac.il",
            "gov.il",
            "k12.il",
            "muni.il",
            "idf.il",
            "co.im",
            "net.im",
            "org.im",
            "ac.im",
            "lkd.co.im",
            "gov.im",
            "nic.im",
            "plc.co.im",
            "co.in",
            "net.in",
            "ac.in",
            "ernet.in",
            "gov.in",
            "nic.in",
            "res.in",
            "gen.in",
            "firm.in",
            "mil.in",
            "org.in",
            "ind.in",
            "ac.je",
            "co.je",
            "net.je",
            "org.je",
            "gov.je",
            "ind.je",
            "jersey.je",
            "ltd.je",
            "sch.je",
            "com.jo",
            "org.jo",
            "net.jo",
            "gov.jo",
            "edu.jo",
            "mil.jo",
            "ad.jp",
            "ac.jp",
            "co.jp",
            "go.jp",
            "or.jp",
            "ne.jp",
            "gr.jp",
            "ed.jp",
            "lg.jp",
            "net.jp",
            "org.jp",
            "gov.jp",
            "hokkaido.jp",
            "aomori.jp",
            "iwate.jp",
            "miyagi.jp",
            "akita.jp",
            "yamagata.jp",
            "fukushima.jp",
            "ibaraki.jp",
            "tochigi.jp",
            "gunma.jp",
            "saitama.jp",
            "chiba.jp",
            "tokyo.jp",
            "kanagawa.jp",
            "niigata.jp",
            "toyama.jp",
            "ishikawa.jp",
            "fukui.jp",
            "yamanashi.jp",
            "nagano.jp",
            "gifu.jp",
            "shizuoka.jp",
            "aichi.jp",
            "mie.jp",
            "shiga.jp",
            "kyoto.jp",
            "osaka.jp",
            "hyogo.jp",
            "nara.jp",
            "wakayama.jp",
            "tottori.jp",
            "shimane.jp",
            "okayama.jp",
            "hiroshima.jp",
            "yamaguchi.jp",
            "tokushima.jp",
            "kagawa.jp",
            "ehime.jp",
            "kochi.jp",
            "fukuoka.jp",
            "saga.jp",
            "nagasaki.jp",
            "kumamoto.jp",
            "oita.jp",
            "miyazaki.jp",
            "kagoshima.jp",
            "okinawa.jp",
            "sapporo.jp",
            "sendai.jp",
            "yokohama.jp",
            "kawasaki.jp",
            "nagoya.jp",
            "kobe.jp",
            "kitakyushu.jp",
            "utsunomiya.jp",
            "kanazawa.jp",
            "takamatsu.jp",
            "matsuyama.jp",
            "com.kh",
            "net.kh",
            "org.kh",
            "per.kh",
            "edu.kh",
            "gov.kh",
            "mil.kh",
            "ac.kr",
            "co.kr",
            "go.kr",
            "ne.kr",
            "or.kr",
            "pe.kr",
            "re.kr",
            "seoul.kr",
            "kyonggi.kr",
            "com.kw",
            "net.kw",
            "org.kw",
            "edu.kw",
            "gov.kw",
            "com.la",
            "net.la",
            "org.la",
            "com.lb",
            "org.lb",
            "net.lb",
            "edu.lb",
            "gov.lb",
            "mil.lb",
            "com.lc",
            "edu.lc",
            "gov.lc",
            "net.lc",
            "org.lc",
            "com.lv",
            "net.lv",
            "org.lv",
            "edu.lv",
            "gov.lv",
            "mil.lv",
            "id.lv",
            "asn.lv",
            "conf.lv",
            "com.ly",
            "net.ly",
            "org.ly",
            "co.ma",
            "net.ma",
            "org.ma",
            "press.ma",
            "ac.ma",
            "com.mk",
            "com.mm",
            "net.mm",
            "org.mm",
            "edu.mm",
            "gov.mm",
            "com.mo",
            "net.mo",
            "org.mo",
            "edu.mo",
            "gov.mo",
            "com.mt",
            "net.mt",
            "org.mt",
            "edu.mt",
            "tm.mt",
            "uu.mt",
            "com.mx",
            "net.mx",
            "org.mx",
            "com.my",
            "org.my",
            "gov.my",
            "edu.my",
            "net.my",
            "com.na",
            "org.na",
            "net.na",
            "alt.na",
            "edu.na",
            "cul.na",
            "unam.na",
            "telecom.na",
            "com.nc",
            "net.nc",
            "org.nc",
            "ac.ng",
            "edu.ng",
            "sch.ng",
            "com.ng",
            "gov.ng",
            "org.ng",
            "net.ng",
            "gob.ni",
            "com.ni",
            "net.ni",
            "edu.ni",
            "nom.ni",
            "org.ni",
            "com.np",
            "net.np",
            "org.np",
            "gov.np",
            "edu.np",
            "ac.nz",
            "co.nz",
            "cri.nz",
            "gen.nz",
            "geek.nz",
            "govt.nz",
            "iwi.nz",
            "maori.nz",
            "mil.nz",
            "net.nz",
            "org.nz",
            "school.nz",
            "com.om",
            "co.om",
            "edu.om",
            "ac.om",
            "gov.om",
            "net.om",
            "org.om",
            "mod.om",
            "museum.om",
            "biz.om",
            "pro.om",
            "med.om",
            "com.pa",
            "net.pa",
            "org.pa",
            "edu.pa",
            "ac.pa",
            "gob.pa",
            "sld.pa",
            "edu.pe",
            "gob.pe",
            "nom.pe",
            "mil.pe",
            "org.pe",
            "com.pe",
            "net.pe",
            "com.pg",
            "net.pg",
            "ac.pg",
            "com.ph",
            "net.ph",
            "org.ph",
            "mil.ph",
            "ngo.ph",
            "aid.pl",
            "agro.pl",
            "atm.pl",
            "auto.pl",
            "biz.pl",
            "com.pl",
            "edu.pl",
            "gmina.pl",
            "gsm.pl",
            "info.pl",
            "mail.pl",
            "miasta.pl",
            "media.pl",
            "mil.pl",
            "net.pl",
            "nieruchomosci.pl",
            "nom.pl",
            "org.pl",
            "pc.pl",
            "powiat.pl",
            "priv.pl",
            "realestate.pl",
            "rel.pl",
            "sex.pl",
            "shop.pl",
            "sklep.pl",
            "sos.pl",
            "szkola.pl",
            "targi.pl",
            "tm.pl",
            "tourism.pl",
            "travel.pl",
            "turystyka.pl",
            "com.pk",
            "net.pk",
            "edu.pk",
            "org.pk",
            "fam.pk",
            "biz.pk",
            "web.pk",
            "gov.pk",
            "gob.pk",
            "gok.pk",
            "gon.pk",
            "gop.pk",
            "gos.pk",
            "edu.ps",
            "gov.ps",
            "plo.ps",
            "sec.ps",
            "com.py",
            "net.py",
            "org.py",
            "edu.py",
            "com.qa",
            "net.qa",
            "org.qa",
            "edu.qa",
            "gov.qa",
            "asso.re",
            "com.re",
            "nom.re",
            "com.ru",
            "net.ru",
            "org.ru",
            "pp.ru",
            "com.sa",
            "edu.sa",
            "sch.sa",
            "med.sa",
            "gov.sa",
            "net.sa",
            "org.sa",
            "pub.sa",
            "com.sb",
            "net.sb",
            "org.sb",
            "edu.sb",
            "gov.sb",
            "com.sd",
            "net.sd",
            "org.sd",
            "edu.sd",
            "sch.sd",
            "med.sd",
            "gov.sd",
            "tm.se",
            "press.se",
            "parti.se",
            "brand.se",
            "fh.se",
            "fhsk.se",
            "fhv.se",
            "komforb.se",
            "kommunalforbund.se",
            "komvux.se",
            "lanarb.se",
            "lanbib.se",
            "naturbruksgymn.se",
            "sshn.se",
            "org.se",
            "pp.se",
            "com.sg",
            "net.sg",
            "org.sg",
            "edu.sg",
            "gov.sg",
            "per.sg",
            "com.sh",
            "net.sh",
            "org.sh",
            "edu.sh",
            "gov.sh",
            "mil.sh",
            "gov.st",
            "saotome.st",
            "principe.st",
            "consulado.st",
            "embaixada.st",
            "org.st",
            "edu.st",
            "net.st",
            "com.st",
            "store.st",
            "mil.st",
            "co.st",
            "com.sv",
            "org.sv",
            "edu.sv",
            "gob.sv",
            "red.sv",
            "com.sy",
            "net.sy",
            "org.sy",
            "gov.sy",
            "ac.th",
            "co.th",
            "go.th",
            "net.th",
            "or.th",
            "com.tn",
            "net.tn",
            "org.tn",
            "edunet.tn",
            "gov.tn",
            "ens.tn",
            "fin.tn",
            "nat.tn",
            "ind.tn",
            "info.tn",
            "intl.tn",
            "rnrt.tn",
            "rnu.tn",
            "rns.tn",
            "tourism.tn",
            "com.tr",
            "net.tr",
            "org.tr",
            "edu.tr",
            "gov.tr",
            "mil.tr",
            "bbs.tr",
            "k12.tr",
            "gen.tr",
            "co.tt",
            "com.tt",
            "org.tt",
            "net.tt",
            "biz.tt",
            "info.tt",
            "pro.tt",
            "name.tt",
            "gov.tt",
            "edu.tt",
            "nic.tt",
            "us.tt",
            "uk.tt",
            "ca.tt",
            "eu.tt",
            "es.tt",
            "fr.tt",
            "it.tt",
            "se.tt",
            "dk.tt",
            "be.tt",
            "de.tt",
            "at.tt",
            "au.tt",
            "co.tv",
            "com.tw",
            "net.tw",
            "org.tw",
            "edu.tw",
            "idv.tw",
            "gove.tw",
            "com.ua",
            "net.ua",
            "org.ua",
            "edu.ua",
            "gov.ua",
            "ac.ug",
            "co.ug",
            "or.ug",
            "go.ug",
            "co.uk",
            "me.uk",
            "org.uk",
            "edu.uk",
            "ltd.uk",
            "plc.uk",
            "net.uk",
            "sch.uk",
            "nic.uk",
            "ac.uk",
            "gov.uk",
            "nhs.uk",
            "police.uk",
            "mod.uk",
            "dni.us",
            "fed.us",
            "com.uy",
            "edu.uy",
            "net.uy",
            "org.uy",
            "gub.uy",
            "mil.uy",
            "com.ve",
            "net.ve",
            "org.ve",
            "co.ve",
            "edu.ve",
            "gov.ve",
            "mil.ve",
            "arts.ve",
            "bib.ve",
            "firm.ve",
            "info.ve",
            "int.ve",
            "nom.ve",
            "rec.ve",
            "store.ve",
            "tec.ve",
            "web.ve",
            "co.vi",
            "net.vi",
            "org.vi",
            "com.vn",
            "biz.vn",
            "edu.vn",
            "gov.vn",
            "net.vn",
            "org.vn",
            "int.vn",
            "ac.vn",
            "pro.vn",
            "info.vn",
            "health.vn",
            "name.vn",
            "com.vu",
            "edu.vu",
            "net.vu",
            "org.vu",
            "de.vu",
            "ch.vu",
            "fr.vu",
            "com.ws",
            "net.ws",
            "org.ws",
            "gov.ws",
            "edu.ws",
            "ac.yu",
            "co.yu",
            "edu.yu",
            "org.yu",
            "com.ye",
            "net.ye",
            "org.ye",
            "gov.ye",
            "edu.ye",
            "mil.ye",
            "ac.za",
            "alt.za",
            "bourse.za",
            "city.za",
            "co.za",
            "edu.za",
            "gov.za",
            "law.za",
            "mil.za",
            "net.za",
            "ngo.za",
            "nom.za",
            "org.za",
            "school.za",
            "tm.za",
            "web.za",
            "co.zw",
            "ac.zw",
            "org.zw",
            "gov.zw",
            "eu.org",
            "au.com",
            "br.com",
            "cn.com",
            "de.com",
            "de.net",
            "eu.com",
            "gb.com",
            "gb.net",
            "hu.com",
            "no.com",
            "qc.com",
            "ru.com",
            "sa.com",
            "se.com",
            "uk.com",
            "uk.net",
            "us.com",
            "uy.com",
            "za.com",
            "dk.org",
            "tel.no",
            "fax.nr",
            "mob.nr",
            "mobil.nr",
            "mobile.nr",
            "tel.nr",
            "tlf.nr",
            "e164.arpa"
        };
        debugOut("array size=" + tmp.length);
        return tmp;
    }
    
    /**
     * Prints a diagnostic message to standard output, but only while the
     * <code>testing</code> flag is set; otherwise this call does nothing.
     *
     * @param msg the text to print
     */
    private static void debugOut(String msg) {
        if (!testing) {
            return;
        }
        System.out.println(msg);
    }
    
    /**
     * Prints the string form of a throwable to standard output, but only
     * while the <code>testing</code> flag is set; otherwise this call does
     * nothing. Only what <code>println</code> renders for the object is
     * written; no stack trace is printed.
     *
     * @param th the throwable to report
     */
    private static void debugOut(Throwable th) {
        if (!testing) {
            return;
        }
        System.out.println(th);
    }
    
    /**
     * Manual test driver. Switches on debug output and then prints, for a
     * few sample domain names, what <code>isTwoPartTLD</code> and
     * <code>isThreePartTLD</code> report for each of them.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String args[]) {
        // Without this flag debugOut() is silent and nothing would be shown.
        testing = true;

        final String[] twoPartSamples = {
            "woof.com",
            "co.uk",
            "lkd.co.im",
            "gov.qa"
        };
        final String[] threePartSamples = {
            "woof.woof.com",
            "lkd.co.im",
            "gov.qa"
        };

        debugOut("2 part TLDs --------");
        for (int idx = 0; idx < twoPartSamples.length; idx++) {
            String candidate = twoPartSamples[idx];
            debugOut(candidate + " found is: " + isTwoPartTLD(candidate));
        }

        debugOut("3 part TLDs --------");
        for (int idx = 0; idx < threePartSamples.length; idx++) {
            String candidate = threePartSamples[idx];
            debugOut(candidate + " found is: " + isThreePartTLD(candidate));
        }
    }
}

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

URI blacklist Matcher

Reply via email to