Hello JC,

Friday, March 5, 2004, 6:38:52 AM, you wrote:

J> That'd be great. Thanks. :)

J>      > J> Anybody know of a rule for the long strings of random words
J>      > J> that don't contain words like 'the, to, a, an, then, and' and
J>      > J> those sort of words? I'd try to write one, but my REGEX skills
J>      > J> are nonexistent.

RM> I've got several that have been posted to the list and seem to do well.
RM> I'm on the road today, but I'll post them again when I get home.

# longwords -- possible sign of random words placed into spam to confuse
# anti-spam software

# RM_bpt_longwords68a: fires on 8 consecutive words of 6 or more lowercase
# letters, each followed by whitespace (no /i flag, so [a-z] is lowercase only).
# Author's annotation (s/h presumably = spam/ham hit counts):
#   type=FP - 7429s/2h of 91714 corpus (74113s/17601h) 01/23/04
#   ham: userid list,
#   "improving compatibility between computer platforms demands certain levels "
body     RM_bpt_longwords68a /\b(?:[a-z]{6,}\s+){8}/
describe RM_bpt_longwords68a Long string of long words
score    RM_bpt_longwords68a 1.500
# RM_bpt_longwords69a: fires on 9 consecutive words of 6 or more lowercase
# letters, each followed by whitespace.
# Author's annotation (s/h presumably = spam/ham hit counts):
#   type=max:1 (add to 59a,68a) - 6595s/1h of 91714 corpus
#   (74113s/17601h) 01/23/04
#   ham: userid list
body     RM_bpt_longwords69a /\b(?:[a-z]{6,}\s+){9}/
describe RM_bpt_longwords69a Long string of long words
score    RM_bpt_longwords69a 1.000
# RM_bpt_longwords78a: fires on 8 consecutive words of 7 or more lowercase
# letters, each followed by whitespace.
# Author's annotation (s/h presumably = spam/ham hit counts):
#   type=max:2 (add to 68a) - 4163s/0h of 91714 corpus
#   (74113s/17601h) 01/23/04
body     RM_bpt_longwords78a /\b(?:[a-z]{7,}\s+){8}/
describe RM_bpt_longwords78a Long string of long words
score    RM_bpt_longwords78a 2.000
# RM_bpt_longwords59a: fires on 9 consecutive words of 5 or more lowercase
# letters, each followed by whitespace.
# Author's annotation (s/h presumably = spam/ham hit counts):
#   type=FP - 8753s/8h of 91714 corpus (74113s/17601h) 01/23/04
#   ham: userid list
body     RM_bpt_longwords59a /\b(?:[a-z]{5,}\s+){9}/
describe RM_bpt_longwords59a Long string of long words
score    RM_bpt_longwords59a 1.500
# RM_bpt_longwords79a: fires on 9 consecutive words of 7 or more lowercase
# letters, each followed by whitespace.
# Author's annotation (s/h presumably = spam/ham hit counts):
#   type=max:1 (add to 78a) - 2950s/0h of 91714 corpus
#   (74113s/17601h) 01/23/04
body     RM_bpt_longwords79a /\b(?:[a-z]{7,}\s+){9}/
describe RM_bpt_longwords79a Long string of long words
score    RM_bpt_longwords79a 1.000
# RM_bpt_longwords96a: fires on 6 consecutive words of 9 or more lowercase
# letters, each followed by whitespace.
# Author's annotation (s/h presumably = spam/ham hit counts):
#   1162s/0h of 91714 corpus (74113s/17601h) 01/23/04
body     RM_bpt_longwords96a /\b(?:[a-z]{9,}\s+){6}/
describe RM_bpt_longwords96a Long string of long words
score    RM_bpt_longwords96a 4.000
# RM_bpt_longwords88a: fires on 8 consecutive words of 8 or more lowercase
# letters, each followed by whitespace.
# Author's annotation (s/h presumably = spam/ham hit counts):
#   1025s/0h of 91714 corpus (74113s/17601h) 01/23/04
body     RM_bpt_longwords88a /\b(?:[a-z]{8,}\s+){8}/
describe RM_bpt_longwords88a Long string of long words
score    RM_bpt_longwords88a 4.000
# RM_bpt_longwords89a: fires on 9 consecutive words of 8 or more lowercase
# letters, each followed by whitespace.
# Author's annotation (s/h presumably = spam/ham hit counts):
#   type=max:1 (add to 88a) - 590s/0h of 91714 corpus
#   (74113s/17601h) 01/23/04
body     RM_bpt_longwords89a /\b(?:[a-z]{8,}\s+){9}/
describe RM_bpt_longwords89a Long string of long words
score    RM_bpt_longwords89a 1.000
# RM_bpt_longwords97: fires on 7 consecutive runs of 9 or more word characters
# (\w: letters, digits, underscore), each followed by whitespace.
# Author's annotation (s/h presumably = spam/ham hit counts):
#   545s/0h of 91714 corpus (74113s/17601h) 01/23/04
body     RM_bpt_longwords97 /\b(?:\w{9,}\s+){7}/
describe RM_bpt_longwords97 Long string of long words
score    RM_bpt_longwords97 3.000
# RM_bpt_longwords98: fires on 8 consecutive runs of 9 or more word characters
# (\w: letters, digits, underscore), each followed by whitespace.
# Author's annotation (s/h presumably = spam/ham hit counts):
#   type=max:1 (add to 97) - 442s/0h of 91714 corpus
#   (74113s/17601h) 01/23/04
body     RM_bpt_longwords98 /\b(?:\w{9,}\s+){8}/
describe RM_bpt_longwords98 Long string of long words
score    RM_bpt_longwords98 1.000
# RM_bpt_longwords99: fires on 9 consecutive runs of 9 or more word characters
# (\w: letters, digits, underscore), each followed by whitespace.
# Author's annotation (s/h presumably = spam/ham hit counts):
#   type=max:1 (add to 98) - 330s/0h of 91714 corpus
#   (74113s/17601h) 01/23/04
body     RM_bpt_longwords99 /\b(?:\w{9,}\s+){9}/
describe RM_bpt_longwords99 Long string of long words
score    RM_bpt_longwords99 1.000

# Second pattern -- really long words, 20+ characters in length, possibly
# separated by commas or periods. Avoid common section separators and
# common/valid encoding lengths
# RM_bpt_longwords20: fires on a space-delimited token of 20-29 word
# characters, optionally ending in a comma or period. The regex has no /x
# flag, so the spaces in it are literal. The negative lookaheads skip a
# space-delimited run of underscores, a run of "A", and tokens of exactly
# 24 word characters.
# Author's annotation (s/h presumably = spam/ham hit counts):
#   10992s/116h of 100689 corpus (81249s/19440h) 02/29/04
body     RM_bpt_longwords20  /(?! _+ )(?! A+ )(?! \w{24} ) \w{20,29}[,.]? /
describe RM_bpt_longwords20  One long string of letters/digits, possible comma or period at end, space between "words"
score    RM_bpt_longwords20  1.939
# RM_bpt_longwords30: fires on a space-delimited token of 30-39 word
# characters, optionally ending in a comma or period. The regex has no /x
# flag, so the spaces in it are literal. The negative lookaheads skip a
# space-delimited run of underscores, a run of "A", and tokens of exactly
# 32 word characters.
# Author's annotation (s/h presumably = spam/ham hit counts):
#   567s/8h of 100689 corpus (81249s/19440h) 02/29/04
body     RM_bpt_longwords30  /(?! _+ )(?! A+ )(?! \w{32} ) \w{30,39}[,.]? /
describe RM_bpt_longwords30  One long string of letters/digits, possible comma or period at end, space between "words"
score    RM_bpt_longwords30  1.630
# RM_bpt_longwords40: fires on a space-delimited token of 40-49 word
# characters, optionally ending in a comma or period. The regex has no /x
# flag, so the spaces in it are literal. The negative lookaheads skip a
# space-delimited run of underscores and a run of "A".
# Author's annotation (s/h presumably = spam/ham hit counts):
#   209s/0h of 100689 corpus (81249s/19440h) 02/29/04
body     RM_bpt_longwords40  /(?! _+ )(?! A+ ) \w{40,49}[,.]? /
describe RM_bpt_longwords40  One long string of letters/digits, possible comma or period at end, space between "words"
score    RM_bpt_longwords40  3.000
# RM_bpt_longwords50: fires on a space-delimited token of 50-59 word
# characters, optionally ending in a comma or period. The regex has no /x
# flag, so the spaces in it are literal. The negative lookaheads skip a
# space-delimited run of underscores and a run of "A".
# Author's annotation (s/h presumably = spam/ham hit counts):
#   86s/0h of 100689 corpus (81249s/19440h) 02/29/04
body     RM_bpt_longwords50  /(?! _+ )(?! A+ ) \w{50,59}[,.]? /
describe RM_bpt_longwords50  One long string of letters/digits, possible comma or period at end, space between "words"
score    RM_bpt_longwords50  1.860
# RM_bpt_longwords60: fires on a space-delimited token of 61-69 word
# characters, optionally ending in a comma or period. The regex has no /x
# flag, so the spaces in it are literal. The negative lookaheads skip a
# space-delimited run of underscores, a run of "A", and tokens of 62-64
# word characters.
# NOTE(review): the range starts at 61, so 60-character tokens match neither
# this rule nor RM_bpt_longwords50 -- presumably intentional; confirm.
# Author's annotation (s/h presumably = spam/ham hit counts):
#   141s/0h of 100689 corpus (81249s/19440h) 02/29/04
body     RM_bpt_longwords60  /(?! _+ )(?! A+ )(?! \w{62,64} ) \w{61,69}[,.]? /
describe RM_bpt_longwords60  One long string of letters/digits, possible comma or period at end, space between "words"
score    RM_bpt_longwords60  2.410
# RM_bpt_longwords70: fires on a space-delimited token of 70 or more word
# characters, optionally ending in a comma or period. The regex has no /x
# flag, so the spaces in it are literal. The negative lookaheads skip a
# space-delimited run of underscores, a run of "A", and tokens of exactly
# 72 or exactly 76 word characters.
# The body pattern must stay on one line: a line break inside it leaves the
# regex unterminated.
# Author's annotation (s/h presumably = spam/ham hit counts):
#   280s/1h of 100689 corpus (81249s/19440h) 02/29/04
body     RM_bpt_longwords70  /(?! _+ )(?! A+ )(?! \w{72} )(?! \w{76} ) \w{70,}[,.]? /
describe RM_bpt_longwords70  One long string of letters/digits, possible comma or period at end, space between "words"
score    RM_bpt_longwords70  2.400



Reply via email to