[Sorry, I am resending this since the list seems to have issues with
attachments that are not plain text files. You will have to remove the
first line of the two Perl files, used to trick my mail program into
sending these as plain text.]
On Thu, 2007-04-05 at 18:42 -0400, Tolkin, Steve wrote:
> I am looking for a program that can recover the original text from
> text that has spaces inserted or deleted.
> Ideally in perl of course.
I tossed together a script to do that several years ago for puzzle
solving, but of course I never had time to finish it. As a result, it
is badly written, incomplete, and in need of revision, but it does a
decent job.
Attached is a Perl module with the important routines included, a test
script showing how to use it, and the output of the test script on my
system. (You'll need to repoint the test script at a dictionary file.)
If anyone makes this better, I would love to hear about it.
Enjoy.
+ Richard
# Delete this first line
#!/usr/bin/perl
# Methods to determine if a message is plaintext
package Plaintext;
use strict;
use warnings;
# For exporting methods and variables
use base qw( Exporter );
use vars qw( @EXPORT @EXPORT_OK %EXPORT_TAGS %normal_frequency );
# Nothing is exported by default; callers must request names explicitly.
@EXPORT = ();
# Names that may be imported on request.
@EXPORT_OK = qw(
%normal_frequency
char_frequency
letter_frequency
compare_frequencies
print_frequency
is_plaintext
load_wordlist
reverse_load_wordlist
load_wordlists
lookup_word
add_spaces
reverse_add_spaces
);
# The ":all" tag imports every name listed in @EXPORT_OK.  (The original
# right-hand side was mangled by e-mail address obfuscation.)
%EXPORT_TAGS = (all => [@EXPORT_OK]);
# Expected frequency of each English letter, as a percentage of all letters
# (the 26 entries add up to 100).
# Source is http://storm.prohosting.com/~glyph/crypto/freq-en.shtml
%normal_frequency = (
    'a' => 8.167, 'b' => 1.492, 'c' => 2.782, 'd' => 4.253, 'e' => 12.702,
    'f' => 2.228, 'g' => 2.015, 'h' => 6.094, 'i' => 6.966, 'j' => 0.153,
    'k' => 0.772, 'l' => 4.025, 'm' => 2.406, 'n' => 6.749, 'o' => 7.507,
    'p' => 1.929, 'q' => 0.095, 'r' => 5.987, 's' => 6.327, 't' => 9.056,
    'u' => 2.758, 'v' => 0.978, 'w' => 2.360, 'x' => 0.150, 'y' => 1.974,
    'z' => 0.075,
);
sub char_frequency {
    # $frequency = char_frequency($message);
    # Tallies every character of $message and returns a hash reference
    # mapping each distinct character to its share of the message,
    # expressed as a percentage.  An empty message yields an empty hash.
    my ($message) = @_;
    my %share_of;
    my $total = 0;
    # First pass: raw occurrence counts plus the overall character count.
    for my $character (split(//, $message)) {
        $share_of{$character}++;
        $total++;
    }
    # One percent of the message length; dividing a count by this value
    # turns it into a percentage.
    my $one_percent = $total / 100;
    # Second pass: values() aliases the hash entries, so this rescales
    # the counts in place.
    $_ /= $one_percent for values %share_of;
    return \%share_of;
}
sub letter_frequency {
    # $frequency = letter_frequency($message);
    # Returns a hash reference mapping each of the letters a-z to its
    # frequency in $message as a percentage.  Upper-case A-Z is folded to
    # lower case and every other character is ignored.  Letters that never
    # occur are still present, with a frequency of 0.
    my ($message) = @_;
    # Fold ASCII upper case to lower case.
    (my $letters = $message) =~ tr/A-Z/a-z/;
    # Delete everything that is not a lower-case ASCII letter.
    $letters =~ tr/a-z//cd;
    # Obtain letter frequencies.
    my $frequency = char_frequency($letters);
    # Guarantee an entry for every letter of the alphabet.
    for my $letter ('a' .. 'z') {
        next if exists $frequency->{$letter};
        $frequency->{$letter} = 0;
    }
    return $frequency;
}
sub print_frequency {
    # print_frequency($frequency);
    # Given a frequency hash reference, prints one line per key in sorted
    # key order: the key, a tab, and the frequency formatted to three
    # decimal places.  Output goes to the currently selected filehandle.
    my ($frequency) = @_;
    for my $symbol (sort keys %{$frequency}) {
        my $row = sprintf("%s\t%2.3f\n", $symbol, $frequency->{$symbol});
        print $row;
    }
    return;
}
sub compare_frequencies {
    # $chi_squared = compare_frequencies($frequency1, $frequency2);
    # Given two frequency hash references, calculates and returns the
    # chi squared value comparing the two.  $frequency1 holds the observed
    # values and $frequency2 the expected ones.
    #
    # A key missing from either hash is treated as a frequency of 0.
    # A category with an expected frequency of 0 contributes nothing when
    # it was not observed either; when it was observed, no finite
    # statistic exists and positive infinity is returned (the original
    # code died with a division by zero in both cases).
    my ($frequency1, $frequency2) = @_;
    # Union of the keys of both hashes; sorted so that the floating-point
    # summation order is deterministic.
    my %is_key = map { $_ => 1 } keys %$frequency1, keys %$frequency2;
    my $chi_squared = 0;
    foreach my $char (sort keys %is_key) {
        my $observed = defined $frequency1->{$char} ? $frequency1->{$char} : 0;
        my $expected = defined $frequency2->{$char} ? $frequency2->{$char} : 0;
        if ($expected == 0) {
            # 0/0: category is absent from both distributions.
            next if $observed == 0;
            # Observed something that should never occur.
            return 9**9**9;
        }
        $chi_squared += ($observed - $expected)**2 / $expected;
    }
    return $chi_squared;
}
sub is_plaintext {
    # $boolean = is_plaintext($message)
    # Returns true if the message is probably plain text.
    # The message's letter frequency is compared with the normal English
    # letter frequency; the result is true when the chi squared value is
    # below the cut-off.
    my ($message) = @_;
    # NOTE(review): 37.65 looks like the chi-square critical value for
    # 25 degrees of freedom at the 5% level -- confirm.
    my $cutoff   = 37.65;
    my $observed = letter_frequency($message);
    return compare_frequencies($observed, \%normal_frequency) < $cutoff;
}
sub load_wordlist {
    # $wordlist = load_wordlist($filename);
    # Load a wordlist from a file (one word per line), returning the
    # wordlist as a hash reference.  Keys are the lower-cased words; values
    # count how many times each word appeared.
    # Dies if the file cannot be opened.
    my ($filename) = @_;
    my %wordlist;
    # Three-argument open with a lexical handle: the filename can never be
    # interpreted as an open mode, and the handle cannot clash with other
    # users of a global FILE handle.
    open my $fh, '<', $filename
        or die "File $filename could not be opened: $!\n";
    # Test for definedness explicitly so that a final line of "0" is kept
    # and lc() is never applied to the undef returned at end of file.
    while (defined(my $line = <$fh>)) {
        chomp($line);
        $wordlist{lc $line}++;
    }
    close $fh;
    return \%wordlist;
}
sub reverse_load_wordlist {
    # $wordlist = reverse_load_wordlist($filename);
    # Load a reversed wordlist from a file (one word per line), returning
    # the wordlist as a hash ref.  Keys are the lower-cased words spelled
    # backwards; values count how many times each word appeared.
    # Dies if the file cannot be opened.
    my ($filename) = @_;
    my %wordlist;
    # Three-argument open with a lexical handle: the filename can never be
    # interpreted as an open mode, and the handle cannot clash with other
    # users of a global FILE handle.
    open my $fh, '<', $filename
        or die "File $filename could not be opened: $!\n";
    # Test for definedness explicitly so that a final line of "0" is kept
    # and lc() is never applied to the undef returned at end of file.
    while (defined(my $line = <$fh>)) {
        chomp($line);
        # scalar() forces string reversal rather than list reversal.
        my $backwards = scalar reverse(lc $line);
        $wordlist{$backwards}++;
    }
    close $fh;
    return \%wordlist;
}
sub load_wordlists { ### XXX Delete two above
    # ($wordlist, $reversed_wordlist) = load_wordlists($filename);
    # Load wordlists from a file (one word per line), returning the
    # wordlists as hash references.  The first hash is keyed by the
    # lower-cased words, the second by the same words spelled backwards;
    # values count how many times each word appeared.
    # Dies if the file cannot be opened.
    my ($filename) = @_;
    my (%wordlist, %reverse_wordlist);
    # Three-argument open with a lexical handle: the filename can never be
    # interpreted as an open mode, and the handle cannot clash with other
    # users of a global FILE handle.
    open my $fh, '<', $filename
        or die "File $filename could not be opened: $!\n";
    # Test for definedness explicitly so that a final line of "0" is kept
    # and lc() is never applied to the undef returned at end of file.
    while (defined(my $line = <$fh>)) {
        chomp($line);
        my $word = lc $line;
        $wordlist{$word}++;
        # scalar() forces string reversal rather than list reversal.
        $reverse_wordlist{ scalar reverse($word) }++;
    }
    close $fh;
    return(\%wordlist, \%reverse_wordlist);
}
sub lookup_word {
    # $boolean = lookup_word($word, $wordlist);
    # True when $word is a key of the $wordlist hash reference, whatever
    # value is stored under it.
    my $candidate  = shift;
    my $dictionary = shift;
    return exists ${$dictionary}{$candidate};
}
sub add_spaces {
    # $new_message = add_spaces($message, $wordlist);
    # Greedily splits a run-together message into words.  At each position
    # the longest prefix (at most 22 characters, at least 2) whose
    # lower-cased form is in $wordlist is taken as a word and followed by
    # a space.  When no prefix matches, a single character is emitted as a
    # fragment wrapped in "<" and ">".
    # As a side effect a statistics line (word count and fragment ratio)
    # is printed to the currently selected filehandle.
    my ($message, $wordlist) = @_;
    my $new_message = '';
    my $max_length = 22;
    # XXX Counting fragments: both counters start at 0 so the statistics
    # line is well defined even when nothing is matched.
    my $words    = 0;
    my $fragment = 0;
  LOOP:
    # Test the length rather than the truth of $message so that a
    # remaining "0" is still processed.
    while (length $message) {
        # Never try a prefix longer than the text that is left.
        my $length = $max_length;
        $length = length($message) if $length > length($message);
        while ($length > 1) {
            my $string = lc(substr($message, 0, $length));
            if (lookup_word($string, $wordlist)) {
                # Four-argument substr removes the word from $message and
                # returns it with its original case.
                $new_message .= substr($message, 0, $length, '') . " ";
                $words++;
                next LOOP;
            }
            $length--;
        }
        # No word starts here: emit one character as a fragment.
        $new_message .= "<" . substr($message, 0, 1, '') . ">";
        $fragment++;
        # XXX NEXT PROCESS
        # XXX 0a if s, add to previous
        # XXX 0b if a or i, leave as separate
        # XXX 1 add next word to fragments
        # XXX 2 solve backward
        # XXX 3 add previous word as needed
        # XXX 4 go to 2
    }
    # XXX Counting fragments: ratio of fragments to output length; guarded
    # so that an empty message does not divide by zero.
    my $fragmented = length($new_message)
        ? $fragment / length($new_message)
        : 0;
    print "Words: $words\tFragmented: $fragmented\n";
    return $new_message;
}
sub reverse_add_spaces {
    # $new_message = reverse_add_spaces($message, $wordlist);
    # Splits a message into words working from its end: the message is
    # reversed, split with add_spaces() against $wordlist (which must hold
    # reversed words, e.g. from reverse_load_wordlist), and the result is
    # reversed back.  Because the whole result string is reversed, fragment
    # markers come out mirrored, as ">x<".
    my ($message, $wordlist) = @_;
    my $backwards = scalar reverse($message);
    # scalar() is essential: a bare reverse() in a return statement takes
    # the caller's context, and in list context it would reverse the
    # one-element list and hand back the string still spelled backwards.
    return scalar reverse(add_spaces($backwards, $wordlist));
}
1;
__END__
# POD is below.
=head1 NAME
Plaintext
=head1 SUMMARY
=head1 SYNOPSIS
=head1 DESCRIPTION
=head1 SEE ALSO
perl
=head1 VERSION
This is Plaintext version 0.01, 14 January 2004.
=head1 AUTHOR
Richard J. Barbalace E<lt>[EMAIL PROTECTED]E<gt>
=head1 COPYRIGHT
Copyright 2004 Richard J. Barbalace. All Rights Reserved.
This program is free software. You may copy or redistribute it under
the same terms as Perl itself.
# Delete this first line
#!/usr/bin/perl
use strict;
use warnings;
use lib ".";
use Data::Dumper;
use Plaintext qw(:all);
# Sample English text used to exercise the Plaintext routines.
my $message = <<EOF;
Saying "the desire to explore and understand is part of our character,"
President Bush Wednesday unveiled an ambitious plan to return Americans to the
moon by 2020 and use the mission as a steppingstone for future manned trips to
Mars and beyond.
"We do not know where this journey will end, yet we know this -- human beings
are headed into the cosmos," Bush said. "Mankind is drawn to the heavens for
the same reason we were once drawn into unknown lands and across the open sea.
We choose to explore space because doing so improves our lives and lifts our
national spirit."
The president unveiled what he billed as a "new course" for the nation's space
program in a speech at NASA headquarters, shifting the long-term focus from the
space shuttle and the international space station to the creation of a new
manned space vehicle that will be flying with a crew in 10 years and will
return humans to the moon within 16 years.
EOF
# Dictionary file, one word per line.  Defaults to web2.txt in the current
# directory; pass a different path as the first command-line argument.
my $dictionary = defined $ARGV[0] ? $ARGV[0] : "web2.txt";
# Unbuffer STDOUT so "Loading wordlist..." is shown before the load starts.
$| = 1;
# Letter statistics and the plaintext verdict for the sample text.
my $freq = letter_frequency($message);
print_frequency($freq);
print "Chi: " . compare_frequencies($freq, \%normal_frequency) . "\n";
print "Message is plaintext.\n" if is_plaintext($message);
print "Loading wordlist...";
# Alternative: load the two lists separately.
#my $wordlist = load_wordlist($dictionary);
#my $revwordlist = reverse_load_wordlist($dictionary);
my ($wordlist, $revwordlist) = load_wordlists($dictionary);
print "done.\n";
# Downcase characters
$message =~ tr(A-Z)(a-z);
# Remove all non-letter characters
$message = join('', split(/[^a-z]/, $message));
print "Old message:\n$message\n";
# Re-insert spaces working forward from the start of the text...
my $new = add_spaces($message, $wordlist);
print "New message:\n$new\n";
# ...and working backward from the end.
my $rev = reverse_add_spaces($message, $revwordlist);
print "Newer message:\n$rev\n";
exit;
a 8.880
b 1.230
c 2.596
d 4.235
e 12.705
f 1.503
g 1.093
h 4.781
i 6.421
j 0.137
k 0.546
l 3.005
m 2.322
n 9.699
o 7.514
p 2.596
q 0.137
r 6.148
s 7.923
t 8.197
u 3.142
v 0.820
w 2.869
x 0.273
y 1.230
z 0.000
Chi: 4.10619937008306
Message is plaintext.
Loading wordlist...done.
Old message:
sayingthedesiretoexploreandunderstandispartofourcharacterpresidentbushwednesdayunveiledanambitiousplantoreturnamericanstothemoonbyandusethemissionasasteppingstoneforfuturemannedtripstomarsandbeyondwedonotknowwherethisjourneywillendyetweknowthishumanbeingsareheadedintothecosmosbushsaidmankindisdrawntotheheavensforthesamereasonwewereoncedrawnintounknownlandsandacrosstheopenseawechoosetoexplorespacebecausedoingsoimprovesourlivesandliftsournationalspiritthepresidentunveiledwhathebilledasanewcourseforthenationsspaceprograminaspeechatnasaheadquartersshiftingthelongtermfocusfromthespaceshuttleandtheinternationalspacestationtothecreationofanewmannedspacevehiclethatwillbeflyingwithacrewinyearsandwillreturnhumanstothemoonwithinyears
Words: 159 Fragmented: 0.0162866449511401
New message:
saying the desire toe <x><p>lore and understand is parto four character
president bush wednesday unveiled anam biti <o>us plant ore turn american stot
hem <o>on by and use themis sion asa steppingstone for future man ned trip
stoma <r>sand beyond wed on <o><t>know where this journey will end yet we know
this human being sare headed into the cosmos bush said mankind is drawn tot
hehe avens forth es ame reason we were once drawn into unknown land sand across
the open sea we choose toe <x><p>lore space because doings <o>improve sour live
sand lift sour national spirit the president unveiled what he billed asa new
course forth enation <s>space program in asp <e><e>chat na sah ea <d>quarters
shifting the long term focus from the space shuttle and the international space
station tot he creation of anew man ned space vehicle that will be flying with
acre winy ear sand will return human stot hem <o>on within year s
Words: 166 Fragmented: 0.019271948608137
Newer message:
saying the desire to ex>p<>l< or ean dun>d< ers tan dis parto four character
president bush wednesday unveil>e< dan ambitious plan to return american>s< to
the moon by and>u< seth emission asa stepping>s< to nef orf utu reman ned
trip>s< to mar sand beyond>w< edo not know where this journey wi>l< lend yet we
know this human being sare headed into the cosmos bush said mankin dis drawn to
the heavens for the same reason we were once drawn into unknown land sand
across the>o<>p< ense awe choose to ex>p< lo respace because doing so improve
sour live sand lift sour national spirit the president unveiled wha the bill>e<
das anew course for then ati ons space prog>r< amin as pee chat>n< asa
headquarters shifting the long term focus from the space shuttle and the
international spa cest ati onto the creation of anew man ned space vehicle tha
twill be flying wi tha>c< rewin year sand will return hu mans to the moon
within year>s<
_______________________________________________
Boston-pm mailing list
[email protected]
http://mail.pm.org/mailman/listinfo/boston-pm