Hello,
I have developed a Matcher that works with URI blacklists (see http://www.surbl.org). This Matcher scans the body of the message for domain names. Domain lookups are then performed against the supplied URI blacklists (SpamCop calls the listed domains "spamvertised" websites). If a hit is found on any of the domains, all recipients are returned by the Matcher.
If there is interest, I would like to contribute this code to the James project. There may be issues with the way the code and TLD data are currently organized, the use of java.util.regex, etc. I will be happy to help out with any additional work that might need to be done.
-Mike Bryant.
/*********************************************************************** * Copyright (c) 2004 Michael Bryant . * * All rights reserved. * * ------------------------------------------------------------------- * * Licensed under the Apache License, Version 2.0 (the "License"); you * * may not use this file except in compliance with the License. You * * may obtain a copy of the License at: * * * * http://www.apache.org/licenses/LICENSE-2.0 * * * * Unless required by applicable law or agreed to in writing, software * * distributed under the License is distributed on an "AS IS" BASIS, * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * * implied. See the License for the specific language governing * * permissions and limitations under the License. * * ------------------------------------------------------------------- * * This software contains code derived from the Apache James Project. * ***********************************************************************/
package net._4mi.james.matchers; import org.apache.mailet.GenericMatcher; import org.apache.mailet.Mail; import net._4mi.james.matchers.util.URIScanner; import java.util.Collection; import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; import java.util.StringTokenizer; import java.net.UnknownHostException; import java.io.IOException; import javax.mail.MessagingException; import javax.mail.internet.MimeBodyPart; import javax.mail.internet.MimeMessage; import javax.mail.internet.MimeMultipart; import javax.mail.internet.MimePart; /** * Scans the message body for URIs, then checks them against a set of * URI spammer blacklists. (See http://www.surbl.org.) * * Example: * <mailet match="InURIBlackLists=sc.surbl.org,ab.surbl.org" class="ToProcessor"> * <processor>spam</processor> * </mailet> * */ public class InURIBlacklists extends GenericMatcher { /** Set of URI black lists to use */ private final ArrayList uribls = new ArrayList(); /** * Initializes this Matcher with a list URI blacklist domains */ public void init() throws javax.mail.MessagingException { StringTokenizer st = new StringTokenizer(getCondition(), ", \t", false); while (st.hasMoreTokens()) { uribls.add(st.nextToken()); } log("uribls="+uribls); } /** * * Scans the message body for URIs, then checks them against a set of * URI blacklists. (See http://www.surbl.org.) If any hits are found, all * recipients are returned. * * @param mail the Mail object which contains a MimeMessage and routing * information * @return a Collection of recipients. If any URI hits are found, returns * all of the mail's recipients, returns null otherwise. 
* @throws javax.mail.MessagingException - if an exception occurs that * interferes with the mailet's normal operation occurred */ public Collection match(Mail mail) throws MessagingException { MimeMessage message = mail.getMessage(); log("doing URIBL lookup on mail w/ subject: \""+message.getSubject()+"\""); HashSet domains = new HashSet(20); try { scanMailForDomains(domains, message); } catch (IOException ioe) { throw new MessagingException("Could not read MimeMessage", ioe); } //log(stack2string(new Exception("foo"))); log("found domains: "+domains); for (Iterator i=domains.iterator(); i.hasNext();) { String domain = (String)i.next(); log("looking up: \""+domain+"\""); for (Iterator j=uribls.iterator(); j.hasNext();) { String uribl = (String)j.next(); log("using uribl: \""+uribl+"\""); String target = domain + "." + uribl; log("target: \""+target+"\""); try { org.apache.james.dnsserver.DNSServer.getByName(target); log("got a hit: \""+target+"\""); return mail.getRecipients(); } catch (UnknownHostException uhe) { // domain not found. keep processing } } } log("no spammy URIs"); return null; } /** * Recursively scans all MimeParts of an email for domain strings. Domain * strings that are found are added to the supplied HashSet. 
* * @param domains HashSet for accumulating domain strings * @param part MimePart to scan */ protected void scanMailForDomains(HashSet domains, MimePart part) throws MessagingException, IOException { log(" mime type is: \""+part.getContentType()+"\""); if (part.isMimeType("text/plain") || part.isMimeType("text/html")) { log(" scanning: \""+part.getContent().toString()+"\""); URIScanner.scanContentForDomains(domains, part.getContent().toString()); } else if (part.isMimeType("multipart/*")) { MimeMultipart multipart = (MimeMultipart)part.getContent(); int count = multipart.getCount(); log(" multipart count is: "+count); for (int index=0; index<count; index++) { log(" recursing index: "+index); MimeBodyPart mimeBodyPart = (MimeBodyPart)multipart.getBodyPart(index); scanMailForDomains(domains, mimeBodyPart); } } } /** static protected String stack2string(Exception e) { try { java.io.StringWriter sw = new java.io.StringWriter(); java.io.PrintWriter pw = new java.io.PrintWriter(sw); e.printStackTrace(pw); return "------" + sw.toString() + "------"; } catch(Exception e2) { return "bad stack2string"; } } **/ }
/*********************************************************************** * Copyright (c) 2004 Michael Bryant . * * All rights reserved. * * ------------------------------------------------------------------- * * Licensed under the Apache License, Version 2.0 (the "License"); you * * may not use this file except in compliance with the License. You * * may obtain a copy of the License at: * * * * http://www.apache.org/licenses/LICENSE-2.0 * * * * Unless required by applicable law or agreed to in writing, software * * distributed under the License is distributed on an "AS IS" BASIS, * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * * implied. See the License for the specific language governing * * permissions and limitations under the License. * * ------------------------------------------------------------------- * * This software contains regular expression code derived from the * * Apache Spmassassin Project. * ***********************************************************************/ package net._4mi.james.matchers.util; import java.util.HashSet; import java.util.Iterator; import java.util.StringTokenizer; import java.util.regex.*; import java.net.URI; import java.net.UnknownHostException; import java.io.IOException; public class URIScanner { /* These regular expressions "inspired" by Spamassassin */ static private final String reserved = ";/?:@&=+$,[]\\#|"; static private final String reservedNoColon = ";/?@&=+$,[]\\#|"; static private final String mark = "-_.!~*'()"; static private final String unreserved = "A-Za-z0-9" + escape(mark) + "\\x00-\\x08\\x0b\\x0c\\x0e-\\x1f"; static private final String uricSet = escape(reserved) + unreserved + "%"; static private final String uricNoColon = escape(reservedNoColon) + unreserved + "%"; static private final String schemeRE = "(?-xism:(?:https?|ftp|mailto|javascript|file))"; static private final String schemelessRE = "(?-xism:(?<![.=])(?:(?i)www\\d*\\.|(?i)ftp\\.))"; static private final String uriRE = 
"(?-xism:\\b(?:"+schemeRE+":["+uricNoColon+"]|"+schemelessRE+")["+uricSet+"#]*)"; /** Pre-compiled pattern that matches URIs */ static private final Pattern uriPattern = Pattern.compile(uriRE); /** Pre-compiled pattern that matches URI scheme strings */ static private final Pattern schemePattern = Pattern.compile("^"+schemeRE+":"); /** Pre-compiled pattern used to cleanup a found URI string */ static private final Pattern uriCleanup = Pattern.compile("^<(.*)>$"); /** Pre-compiled pattern used to cleanup a found URI string */ static private final Pattern uriCleanup2 = Pattern.compile("[\\]\\)>#]$"); /** Pre-compile pattern for identifying "mailto" patterns */ static private final Pattern uriCleanup3 = Pattern.compile("^(?i)mailto:([^\\/]{2})(.*)$"); /* These regular expressions also "inspired" by Spamassassin */ static private final String esc = "\\\\"; static private final String period = "\\."; static private final String space = "\\040"; static private final String open_br = "\\["; static private final String close_br = "\\]"; static private final String nonASCII = "\\x80-\\xff"; static private final String ctrl = "\\000-\\037"; static private final String cr_list = "\\n\\015"; static private final String qtext = "[^"+esc+nonASCII+cr_list+"\"]"; static private final String dtext = "[^"+esc+nonASCII+cr_list+open_br+close_br+"]"; static private final String quoted_pair = esc+"[^"+nonASCII+"]"; static private final String atom_char = "[^("+space+")<>@,;:\"."+esc+open_br+close_br+ctrl+nonASCII+"]"; static private final String atom = "(?>"+atom_char+"+)"; static private final String quoted_str = "\""+qtext+"*(?:"+quoted_pair+qtext+"*)*\""; static private final String word = "(?:"+atom+"|"+quoted_str+")"; static private final String local_part = word+"(?:"+period+word+")*"; static private final String label = "[A-Za-z\\d](?:[A-Za-z\\d-]*[A-Za-z\\d])?"; static private final String domain_ref = label+"(?:"+period+label+")*"; static private final String domain_lit = 
open_br+"(?:"+dtext+"|"+quoted_pair+")*"+close_br; static private final String domain = "(?:"+domain_ref+"|"+domain_lit+")"; static private final String Addr_spec_re = "(?-xism:"+local_part+"[EMAIL PROTECTED]"+domain+")"; /** Pre-compiled pattern for matching "schemeless" mailto strings */ static private final Pattern emailAddrPattern = Pattern.compile(Addr_spec_re); /** Simple reqular expression to match an octet part of an IP address */ static private final String octet = "(?:[1-2][0-9][0-9])|(?:[1-9][0-9])|(?:[0-9])"; /** Simple regular expression to match a part of a domain string in the TLDLookup cache. */ static private final String tld = "[A-Za-z0-9\\-]*"; /** Simple regular expression that matches a two-part TLD */ static private final String tld2 = tld+"\\."+tld; /** Simple regular expression that matches a three-part TLD */ static private final String tld3 = tld+"\\."+tld+"\\."+tld; /** Regular expression that matches and captures parts of a possible one-part TLD domain string */ static private final String tldCap = "("+tld+"\\.("+tld+"))$"; /** Regular expression that matches and captures parts of a possible two-part TLD domain string */ static private final String tld2Cap = "("+tld+"\\.("+tld2+"))$"; /** Regular expression that matches and captures parts of a possible three-part TLD domain string */ static private final String tld3Cap = "("+tld+"\\.("+tld3+"))$"; /** Regular expression that matches and captures parts of an IP address */ static private final String ipCap = "(("+octet+")\\.("+octet+")\\.("+octet+")\\.("+octet+"))$"; /** Pre-compiled pattern that matches IP addresses */ static private final Pattern ipCapPattern = Pattern.compile(ipCap); /** Pre-compiled pattern that matches domain string that is possibly contained in a one-part TLD */ static private final Pattern tldCapPattern = Pattern.compile(tldCap); /** Pre-compiled pattern that matches domain string that is possibly contained in a two-part TLD */ static private final Pattern 
tld2CapPattern = Pattern.compile(tld2Cap); /** Pre-compiled pattern that matches domain string that is possibly contained in a three-part TLD */ static private final Pattern tld3CapPattern = Pattern.compile(tld3Cap); /** controls testing/debug output */ static private boolean testing = false; /** * Scans a character sequence for URIs. Then add all unique domain strings * derived from those found URIs to the supplied HashSet. * <p> * This function calls scanContentForHosts() to grab all the host strings. * Then it calls domainFromHost() on each host string found to distill them * to their basic "registrar" domains. * * @param domains a HashSet to be populated with all domain strings found in * the content * @param content a character sequence to be scanned for URIs */ static public void scanContentForDomains(HashSet domains, CharSequence content) { HashSet hosts = scanContentForHosts(content); for (Iterator i = hosts.iterator(); i.hasNext();) { String domain = domainFromHost((String)i.next()); if (null != domain) { if (false == domains.contains(domain)) { domains.add(domain); } } } } /** * Scans a character sequence for URIs. 
Then returns all unique host strings * derived from those found URIs in a HashSet * * @param content a character sequence to be scanned for URIs * @return a HashSet containing host strings */ static protected HashSet scanContentForHosts(CharSequence content) { HashSet set = new HashSet(); try { // look for URIs Matcher mat = uriPattern.matcher(content); while (mat.find()) { String found = mat.group(); Matcher cleanMat = uriCleanup.matcher(found); if (cleanMat.find()) { found = cleanMat.group(1); } cleanMat = uriCleanup2.matcher(found); if (cleanMat.find()) { found = cleanMat.replaceAll(""); } cleanMat = uriCleanup3.matcher(found); if (cleanMat.find()) { found = "mailto://"+cleanMat.group(1)+cleanMat.group(2); } cleanMat = schemePattern.matcher(found); if (!cleanMat.find()) { if (found.matches("^(?i)www\\d*\\..*")) { found = "http://" + found; } else if (found.matches("^(?i)ftp\\..*")) { found = "ftp://" + found; } } String host = hostFromUriStr(found); if (null != host) { host = host.toLowerCase(); if (false == set.contains(host)) { set.add(host); } } } // look for "schemeless" email addresses, too mat = emailAddrPattern.matcher(content); while (mat.find()) { String found = mat.group(); debugOut("******** mailfound=\""+found+"\""); found = "mailto://"+found; debugOut("*******6 mailfoundfound=\""+found+"\" after cleanup 6"); String host = hostFromUriStr(found); if (null != host) { host = host.toLowerCase(); if (false == set.contains(host)) { set.add(host); } } } } catch (Exception ex) { debugOut(ex.toString()); ex.printStackTrace(); } return set; } /** * Extracts and returns the host portion of URI string. * * This function uses java.net.URI. 
* * @param uriStr a string containing a URI * @return the host portion of the supplied URI, null if no host string * could be found */ static protected String hostFromUriStr(String uriStr) { debugOut("hostFromUriStr(\""+uriStr+"\")"); String host = null; try { URI uri = new URI(uriStr); host = uri.getHost(); } catch (Exception ex) { } return host; } /** * Extracts and returns the registrar domain portion of a host string. This * funtion checks all known multi-part TLDs to make sure that registrar * domain is complete. For example, if the supplied host string is * "subdomain.example.co.uk", the TLD is "co.uk" and not "uk". Therefore, * the correct registrar domain is not "co.uk", but "example.co.uk". If the * domain string is an IP address, then the octets are returned in reverse * order. * * @param host a string containing a host name * @return the registrar domain portion of the supplied host string */ static protected String domainFromHost(String host) { debugOut("domainFromHost(\""+host+"\")"); String domain = null; Matcher mat; try { // IP addrs mat = ipCapPattern.matcher(host); if (mat.find()) { // reverse the octets now domain = mat.group(5)+"."+mat.group(4)+"."+mat.group(3)+"."+mat.group(2); debugOut("domain=\""+domain+"\""); return domain; } // 3-part TLDs mat = tld3CapPattern.matcher(host); if (mat.find()) { String tld = mat.group(2); if (TLDLookup.isThreePartTLD(tld)) { domain = mat.group(1); debugOut("domain=\""+domain+", tld=\""+tld+"\""); return domain; } } // 2-part TLDs mat = tld2CapPattern.matcher(host); if (mat.find()) { String tld = mat.group(2); if (TLDLookup.isTwoPartTLD(tld)) { domain = mat.group(1); debugOut("domain=\""+domain+", tld=\""+tld+"\""); return domain; } } // 1-part TLDs mat = tldCapPattern.matcher(host); if (mat.find()) { String tld = mat.group(2); domain = mat.group(1); debugOut("domain=\""+domain+", tld=\""+tld+"\""); return domain; } } catch (Exception ex) { debugOut(ex.toString()); ex.printStackTrace(); } return domain; } /** * 
Debugging output */ private static void debugOut(String msg) { if (true == testing) { System.out.println(msg); } } /** * Test driver */ public static void main(String args[]) { testing = true; String str = "jhl http://123.234.12.34/foo.html kh mailto:[EMAIL PROTECTED] jlksjl <http://Www.foo.org> hkjhkjhk kljhlkj www3.foobar.org kjhk wWw.foojar.org jh fTp.foot.com lhjhkj h www.foo.org"; debugOut("str=\""+str+"\""); HashSet domains = new HashSet(); scanContentForDomains(domains, str); for (Iterator i=domains.iterator(); i.hasNext();) { String domain = (String)i.next(); debugOut("domain = "+domain); } } /** * A utility function that "escapes" special characters in a string. * * @param str a string to be processed * @return modified "escaped" string */ private static String escape(String str) { StringBuffer buffer = new StringBuffer(); for (int i=0; i<str.length(); i++) { char ch = str.charAt(i); if (Character.isDigit(ch) || Character.isUpperCase(ch) || Character.isLowerCase(ch) || ch == '_') { buffer.append(ch); } else { buffer.append("\\"); buffer.append(ch); } } return buffer.toString(); } }
/*********************************************************************** * Copyright (c) 2004 Michael Bryant . * * All rights reserved. * * ------------------------------------------------------------------- * * Licensed under the Apache License, Version 2.0 (the "License"); you * * may not use this file except in compliance with the License. You * * may obtain a copy of the License at: * * * * http://www.apache.org/licenses/LICENSE-2.0 * * * * Unless required by applicable law or agreed to in writing, software * * distributed under the License is distributed on an "AS IS" BASIS, * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * * implied. See the License for the specific language governing * * permissions and limitations under the License. * ***********************************************************************/ package net._4mi.james.matchers.util; import java.util.HashSet; /** * A utility class that caches sets of multi-part top level domains (TLDs) for * quick lookup. */ public class TLDLookup { /** Simple regular expression to match strings in the cache. Note: if the collection of known mult-part TLDs change to contain characters other than these, this string must be modified. 
*/ static private final String tld = "[A-Za-z0-9\\-]*"; /** Simple regular expression that matches a two-part TLD */ static private final String tld2 = tld+"\\."+tld; /** Simple regular expression that matches a three-part TLD */ static private final String tld3 = tld+"\\."+tld+"\\."+tld; /** Array of all known multi-level TLDs */ static private final String[] multiPartTLDs = initMultiPartTLDs(); /** A set of all known two-part TLDs */ static private final HashSet twoPartTLDs = initTwoPartTLDs(); /** A set of all known three-part TLDs */ static private final HashSet threePartTLDs = initThreePartTLDs(); /** controls testing/debug output */ static private boolean testing = false; /** * Determines if a two-part domain string (xxx.xxx) is contained in the * cache of known two-part TLDs. * * @param domain a String representing a two-part domain * @return true if the domain string is found in the cache, false otherwise */ static public boolean isTwoPartTLD(String domain) { return twoPartTLDs.contains(domain); } /** * Determines if a three-part domain string (xxx.xxx.xxx) is contained in * the cache of known three-part TLDs. * * @param domain a String representing a three-part domain * @return true if the domain string is found in the cache, false otherwise */ static public boolean isThreePartTLD(String domain) { return threePartTLDs.contains(domain); } /** * Initialize two-part top-level domain cache. * * @return a HashSet containing all known two-part TLDs */ static private HashSet initTwoPartTLDs() { HashSet set = new HashSet(900); for (int i=0; i<multiPartTLDs.length; i++) { try { if (multiPartTLDs[i].matches("^"+tld2+"$")) { set.add(multiPartTLDs[i]); } } catch (Exception ex) { debugOut(ex); } } debugOut("initTwoPartTLDs size="+set.size()); return set; } /** * Initialize three-part top-level domain cache. 
* * @return a HashSet containing all known three-part TLDs */ static private HashSet initThreePartTLDs() { HashSet set = new HashSet(); for (int i=0; i<multiPartTLDs.length; i++) { try { if (multiPartTLDs[i].matches("^"+tld3+"$")) { debugOut("adding \"" + multiPartTLDs[i] + "\""); set.add(multiPartTLDs[i]); } } catch (Exception ex) { debugOut(ex); } } debugOut("initThreePartTLDs size="+set.size()); return set; } /** * Initialize an array of Strings containing all known multi-part TLDs * * @return an array of all known multi-part TLDs */ static private String[] initMultiPartTLDs() { String[] tmp = new String[] { "com.ac", "edu.ac", "gov.ac", "edu.ai", "gov.ai", "com.ar", "net.ar", "org.ar", "gov.ar", "mil.ar", "edu.ar", "int.ar", "co.at", "ac.at", "or.at", "gv.at", "priv.at", "com.au", "gov.au", "org.au", "edu.au", "id.au", "oz.au", "info.au", "net.au", "asn.au", "csiro.au", "telememo.au", "conf.au", "otc.au", "com.az", "net.az", "org.az", "com.bb", "net.bb", "org.bb", "ac.be", "belgie.be", "dns.be", "fgov.be", "com.bh", "gov.bh", "net.bh", "edu.bh", "org.bh", "com.bm", "edu.bm", "gov.bm", "org.bm", "net.bm", "adm.br", "adv.br", "agr.br", "am.br", "arq.br", "art.br", "ato.br", "bio.br", "bmd.br", "cim.br", "cng.br", "cnt.br", "com.br", "coop.br", "ecn.br", "edu.br", "eng.br", "esp.br", "etc.br", "eti.br", "far.br", "fm.br", "fnd.br", "fot.br", "fst.br", "g12.br", "ggf.br", "gov.br", "imb.br", "ind.br", "inf.br", "jor.br", "lel.br", "mat.br", "med.br", "mil.br", "mus.br", "net.br", "nom.br", "not.br", "ntr.br", "odo.br", "org.br", "ppg.br", "pro.br", "psc.br", "psi.br", "qsl.br", "rec.br", "slg.br", "srv.br", "tmp.br", "trd.br", "tur.br", "tv.br", "vet.br", "zlg.br", "com.bs", "net.bs", "org.bs", "ab.ca", "bc.ca", "mb.ca", "nb.ca", "nf.ca", "nl.ca", "ns.ca", "nt.ca", "nu.ca", "on.ca", "pe.ca", "qc.ca", "sk.ca", "yk.ca", "co.ck", "net.ck", "org.ck", "edu.ck", "gov.ck", "com.cn", "edu.cn", "gov.cn", "net.cn", "org.cn", "ac.cn", "ah.cn", "bj.cn", "cq.cn", "gd.cn", 
"gs.cn", "gx.cn", "gz.cn", "hb.cn", "he.cn", "hi.cn", "hk.cn", "hl.cn", "hn.cn", "jl.cn", "js.cn", "ln.cn", "mo.cn", "nm.cn", "nx.cn", "qh.cn", "sc.cn", "sn.cn", "sh.cn", "sx.cn", "tj.cn", "tw.cn", "xj.cn", "xz.cn", "yn.cn", "zj.cn", "arts.co", "com.co", "edu.co", "firm.co", "gov.co", "info.co", "int.co", "nom.co", "mil.co", "org.co", "rec.co", "store.co", "web.co", "ac.cr", "co.cr", "ed.cr", "fi.cr", "go.cr", "or.cr", "sa.cr", "com.cu", "net.cu", "org.cu", "ac.cy", "com.cy", "gov.cy", "net.cy", "org.cy", "co.dk", "art.do", "com.do", "edu.do", "gov.do", "org.do", "mil.do", "net.do", "web.do", "com.dz", "org.dz", "net.dz", "gov.dz", "edu.dz", "ass.dz", "pol.dz", "art.dz", "com.ec", "k12.ec", "edu.ec", "fin.ec", "med.ec", "gov.ec", "mil.ec", "org.ec", "net.ec", "com.eg", "edu.eg", "eun.eg", "gov.eg", "net.eg", "org.eg", "sci.eg", "com.er", "net.er", "org.er", "edu.er", "mil.er", "gov.er", "ind.er", "com.et", "gov.et", "org.et", "edu.et", "net.et", "biz.et", "name.et", "info.et", "ac.fj", "com.fj", "gov.fj", "id.fj", "org.fj", "school.fj", "com.fk", "ac.fk", "gov.fk", "net.fk", "nom.fk", "org.fk", "asso.fr", "nom.fr", "barreau.fr", "com.fr", "prd.fr", "presse.fr", "tm.fr", "aeroport.fr", "assedic.fr", "avocat.fr", "avoues.fr", "cci.fr", "chambagri.fr", "chirurgiens-dentistes.fr", "experts-comptables.fr", "geometre-expert.fr", "gouv.fr", "greta.fr", "huissier-justice.fr", "medecin.fr", "notaires.fr", "pharmacien.fr", "port.fr", "veterinaire.fr", "com.ge", "edu.ge", "gov.ge", "mil.ge", "net.ge", "org.ge", "pvt.ge", "co.gg", "org.gg", "sch.gg", "ac.gg", "gov.gg", "ltd.gg", "ind.gg", "net.gg", "alderney.gg", "guernsey.gg", "sark.gg", "com.gu", "edu.gu", "net.gu", "org.gu", "gov.gu", "mil.gu", "com.hk", "net.hk", "org.hk", "idv.hk", "gov.hk", "edu.hk", "co.hu", "2000.hu", "erotika.hu", "jogasz.hu", "sex.hu", "video.hu", "info.hu", "agrar.hu", "film.hu", "konyvelo.hu", "shop.hu", "org.hu", "bolt.hu", "forum.hu", "lakas.hu", "suli.hu", "priv.hu", "casino.hu", "games.hu", 
"media.hu", "szex.hu", "sport.hu", "city.hu", "hotel.hu", "news.hu", "tozsde.hu", "tm.hu", "erotica.hu", "ingatlan.hu", "reklam.hu", "utazas.hu", "ac.id", "co.id", "go.id", "mil.id", "net.id", "or.id", "co.il", "net.il", "org.il", "ac.il", "gov.il", "k12.il", "muni.il", "idf.il", "co.im", "net.im", "org.im", "ac.im", "lkd.co.im", "gov.im", "nic.im", "plc.co.im", "co.in", "net.in", "ac.in", "ernet.in", "gov.in", "nic.in", "res.in", "gen.in", "firm.in", "mil.in", "org.in", "ind.in", "ac.je", "co.je", "net.je", "org.je", "gov.je", "ind.je", "jersey.je", "ltd.je", "sch.je", "com.jo", "org.jo", "net.jo", "gov.jo", "edu.jo", "mil.jo", "ad.jp", "ac.jp", "co.jp", "go.jp", "or.jp", "ne.jp", "gr.jp", "ed.jp", "lg.jp", "net.jp", "org.jp", "gov.jp", "hokkaido.jp", "aomori.jp", "iwate.jp", "miyagi.jp", "akita.jp", "yamagata.jp", "fukushima.jp", "ibaraki.jp", "tochigi.jp", "gunma.jp", "saitama.jp", "chiba.jp", "tokyo.jp", "kanagawa.jp", "niigata.jp", "toyama.jp", "ishikawa.jp", "fukui.jp", "yamanashi.jp", "nagano.jp", "gifu.jp", "shizuoka.jp", "aichi.jp", "mie.jp", "shiga.jp", "kyoto.jp", "osaka.jp", "hyogo.jp", "nara.jp", "wakayama.jp", "tottori.jp", "shimane.jp", "okayama.jp", "hiroshima.jp", "yamaguchi.jp", "tokushima.jp", "kagawa.jp", "ehime.jp", "kochi.jp", "fukuoka.jp", "saga.jp", "nagasaki.jp", "kumamoto.jp", "oita.jp", "miyazaki.jp", "kagoshima.jp", "okinawa.jp", "sapporo.jp", "sendai.jp", "yokohama.jp", "kawasaki.jp", "nagoya.jp", "kobe.jp", "kitakyushu.jp", "utsunomiya.jp", "kanazawa.jp", "takamatsu.jp", "matsuyama.jp", "com.kh", "net.kh", "org.kh", "per.kh", "edu.kh", "gov.kh", "mil.kh", "ac.kr", "co.kr", "go.kr", "ne.kr", "or.kr", "pe.kr", "re.kr", "seoul.kr", "kyonggi.kr", "com.kw", "net.kw", "org.kw", "edu.kw", "gov.kw", "com.la", "net.la", "org.la", "com.lb", "org.lb", "net.lb", "edu.lb", "gov.lb", "mil.lb", "com.lc", "edu.lc", "gov.lc", "net.lc", "org.lc", "com.lv", "net.lv", "org.lv", "edu.lv", "gov.lv", "mil.lv", "id.lv", "asn.lv", "conf.lv", "com.ly", 
"net.ly", "org.ly", "co.ma", "net.ma", "org.ma", "press.ma", "ac.ma", "com.mk", "com.mm", "net.mm", "org.mm", "edu.mm", "gov.mm", "com.mo", "net.mo", "org.mo", "edu.mo", "gov.mo", "com.mt", "net.mt", "org.mt", "edu.mt", "tm.mt", "uu.mt", "com.mx", "net.mx", "org.mx", "com.my", "org.my", "gov.my", "edu.my", "net.my", "com.na", "org.na", "net.na", "alt.na", "edu.na", "cul.na", "unam.na", "telecom.na", "com.nc", "net.nc", "org.nc", "ac.ng", "edu.ng", "sch.ng", "com.ng", "gov.ng", "org.ng", "net.ng", "gob.ni", "com.ni", "net.ni", "edu.ni", "nom.ni", "org.ni", "com.np", "net.np", "org.np", "gov.np", "edu.np", "ac.nz", "co.nz", "cri.nz", "gen.nz", "geek.nz", "govt.nz", "iwi.nz", "maori.nz", "mil.nz", "net.nz", "org.nz", "school.nz", "com.om", "co.om", "edu.om", "ac.om", "gov.om", "net.om", "org.om", "mod.om", "museum.om", "biz.om", "pro.om", "med.om", "com.pa", "net.pa", "org.pa", "edu.pa", "ac.pa", "gob.pa", "sld.pa", "edu.pe", "gob.pe", "nom.pe", "mil.pe", "org.pe", "com.pe", "net.pe", "com.pg", "net.pg", "ac.pg", "com.ph", "net.ph", "org.ph", "mil.ph", "ngo.ph", "aid.pl", "agro.pl", "atm.pl", "auto.pl", "biz.pl", "com.pl", "edu.pl", "gmina.pl", "gsm.pl", "info.pl", "mail.pl", "miasta.pl", "media.pl", "mil.pl", "net.pl", "nieruchomosci.pl", "nom.pl", "org.pl", "pc.pl", "powiat.pl", "priv.pl", "realestate.pl", "rel.pl", "sex.pl", "shop.pl", "sklep.pl", "sos.pl", "szkola.pl", "targi.pl", "tm.pl", "tourism.pl", "travel.pl", "turystyka.pl", "com.pk", "net.pk", "edu.pk", "org.pk", "fam.pk", "biz.pk", "web.pk", "gov.pk", "gob.pk", "gok.pk", "gon.pk", "gop.pk", "gos.pk", "edu.ps", "gov.ps", "plo.ps", "sec.ps", "com.py", "net.py", "org.py", "edu.py", "com.qa", "net.qa", "org.qa", "edu.qa", "gov.qa", "asso.re", "com.re", "nom.re", "com.ru", "net.ru", "org.ru", "pp.ru", "com.sa", "edu.sa", "sch.sa", "med.sa", "gov.sa", "net.sa", "org.sa", "pub.sa", "com.sb", "net.sb", "org.sb", "edu.sb", "gov.sb", "com.sd", "net.sd", "org.sd", "edu.sd", "sch.sd", "med.sd", "gov.sd", "tm.se", 
"press.se", "parti.se", "brand.se", "fh.se", "fhsk.se", "fhv.se", "komforb.se", "kommunalforbund.se", "komvux.se", "lanarb.se", "lanbib.se", "naturbruksgymn.se", "sshn.se", "org.se", "pp.se", "com.sg", "net.sg", "org.sg", "edu.sg", "gov.sg", "per.sg", "com.sh", "net.sh", "org.sh", "edu.sh", "gov.sh", "mil.sh", "gov.st", "saotome.st", "principe.st", "consulado.st", "embaixada.st", "org.st", "edu.st", "net.st", "com.st", "store.st", "mil.st", "co.st", "com.sv", "org.sv", "edu.sv", "gob.sv", "red.sv", "com.sy", "net.sy", "org.sy", "gov.sy", "ac.th", "co.th", "go.th", "net.th", "or.th", "com.tn", "net.tn", "org.tn", "edunet.tn", "gov.tn", "ens.tn", "fin.tn", "nat.tn", "ind.tn", "info.tn", "intl.tn", "rnrt.tn", "rnu.tn", "rns.tn", "tourism.tn", "com.tr", "net.tr", "org.tr", "edu.tr", "gov.tr", "mil.tr", "bbs.tr", "k12.tr", "gen.tr", "co.tt", "com.tt", "org.tt", "net.tt", "biz.tt", "info.tt", "pro.tt", "name.tt", "gov.tt", "edu.tt", "nic.tt", "us.tt", "uk.tt", "ca.tt", "eu.tt", "es.tt", "fr.tt", "it.tt", "se.tt", "dk.tt", "be.tt", "de.tt", "at.tt", "au.tt", "co.tv", "com.tw", "net.tw", "org.tw", "edu.tw", "idv.tw", "gove.tw", "com.ua", "net.ua", "org.ua", "edu.ua", "gov.ua", "ac.ug", "co.ug", "or.ug", "go.ug", "co.uk", "me.uk", "org.uk", "edu.uk", "ltd.uk", "plc.uk", "net.uk", "sch.uk", "nic.uk", "ac.uk", "gov.uk", "nhs.uk", "police.uk", "mod.uk", "dni.us", "fed.us", "com.uy", "edu.uy", "net.uy", "org.uy", "gub.uy", "mil.uy", "com.ve", "net.ve", "org.ve", "co.ve", "edu.ve", "gov.ve", "mil.ve", "arts.ve", "bib.ve", "firm.ve", "info.ve", "int.ve", "nom.ve", "rec.ve", "store.ve", "tec.ve", "web.ve", "co.vi", "net.vi", "org.vi", "com.vn", "biz.vn", "edu.vn", "gov.vn", "net.vn", "org.vn", "int.vn", "ac.vn", "pro.vn", "info.vn", "health.vn", "name.vn", "com.vu", "edu.vu", "net.vu", "org.vu", "de.vu", "ch.vu", "fr.vu", "com.ws", "net.ws", "org.ws", "gov.ws", "edu.ws", "ac.yu", "co.yu", "edu.yu", "org.yu", "com.ye", "net.ye", "org.ye", "gov.ye", "edu.ye", "mil.ye", "ac.za", 
"alt.za", "bourse.za", "city.za", "co.za", "edu.za", "gov.za", "law.za", "mil.za", "net.za", "ngo.za", "nom.za", "org.za", "school.za", "tm.za", "web.za", "co.zw", "ac.zw", "org.zw", "gov.zw", "eu.org", "au.com", "br.com", "cn.com", "de.com", "de.net", "eu.com", "gb.com", "gb.net", "hu.com", "no.com", "qc.com", "ru.com", "sa.com", "se.com", "uk.com", "uk.net", "us.com", "uy.com", "za.com", "dk.org", "tel.no", "fax.nr", "mob.nr", "mobil.nr", "mobile.nr", "tel.nr", "tlf.nr", "e164.arpa" }; debugOut("array size=" + tmp.length); return tmp; } /** * Debugging output */ private static void debugOut(String msg) { if (true == testing) { System.out.println(msg); } } /** * Debugging output */ private static void debugOut(Throwable th) { if (true == testing) { System.out.println(th); } } /** * Test driver */ public static void main(String args[]) { testing = true; String[] test2 = new String[] { "woof.com", "co.uk", "lkd.co.im", "gov.qa", }; String[] test3 = new String[] { "woof.woof.com", "lkd.co.im", "gov.qa", }; int i; debugOut("2 part TLDs --------"); for (i=0; i<test2.length; i++) { debugOut(test2[i]+" found is: "+isTwoPartTLD(test2[i])); } debugOut("3 part TLDs --------"); for (i=0; i<test3.length; i++) { debugOut(test3[i]+" found is: "+isThreePartTLD(test3[i])); } } }
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]
