#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Encode and decode a representation using a reduced-charset version of ASCII.
My theory is that you can make text more compressible by encoding it
in a modeful way, like the old Baudot codes. I’m appropriating four
ASCII codes for the role of the old FIGS:
- DC1 \x11 ^Q: change state to lowercase letters (the initial state)
- DC2 \x12 ^R: change state to uppercase letters
- DC3 \x13 ^S: change state to numbers
- DC4 \x14 ^T: treat the next character only as an uppercase letter
I know SI and SO would be more appropriate here, but I need three
states, not two.
ESC followed by any of the above four, or itself, simply passes that
character through to the output.
All three of uppercase, letters, and numbers are encoded as lowercase
letters. Effectively, this reduces the 128-character ASCII character
set by 36 to 92 characters.
The algorithm for using DC4 (capitalize next letter) is potentially
problematic, in that it may use a lot of buffer space.
This approach *does* make my test file (Project Gutenberg’s King James
Bible, minus CR characters but retaining LFs) more compressible: it
compresses to 1369889 bytes instead of 1371072 bytes with this
preprocessing applied --- indicating that the preprocessing picked up
some structure gzip wasn’t able to find itself! But this is a very
small difference, like 0.08%. The encoding bloated the original file
from 4351847 bytes to 4515061 bytes, a 3.8% increase. A random
translation from a 128-character charset to a 92-character charset
would be expected to increase the file size by 7.3%, not 3.8%.
Doing a simple huffman compression on the same file produced 19939830
bits on the raw file and 20094861 bits on the encoded file, both
exclusive of the huffman table itself, an 0.8% increase in the size of
the file, rather than gzip’s 0.08% improvement. This was a little
disappointing.
However, it achieved its purpose of formulating a practical huffman
code for manual encoding of English text with spacing, punctuation,
and formatting, with only 38 code points and reasonable efficiency of
about 4.6 bits per character. (Details in a later post.)
"""
uppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
lowercase = 'abcdefghijklmnopqrstuvwxyz'
numbers = '0123456789'
(DC1, DC2, DC3, DC4, ESC) = ('\x11', '\x12', '\x13', '\x14', '\x1b')
def chars(infile):
for line in infile:
for char in line:
yield char
def decode(infile):
current_state = lowercase
capitalize_next_char = False
passthrough_next_char = False
for char in chars(infile):
if passthrough_next_char:
yield char
passthrough_next_char = False
elif char in lowercase:
if capitalize_next_char:
yield uppercase[lowercase.index(char)]
capitalize_next_char = False
else:
yield current_state[lowercase.index(char)]
elif char == DC1:
current_state = lowercase
elif char == DC2:
current_state = uppercase
elif char == DC3:
current_state = numbers
elif char == DC4:
capitalize_next_char = True
elif char == ESC:
passthrough_next_char = True
else:
yield char
def ok(a, b): assert a == b, (a, b)
ok(''.join(decode('this \x12is \x11a \x14test \x13bcd! \x1b\x12')),
'this IS a Test 123! \x12')
def chunks(infile):
"Break an infile into chunks such that any capital starts a new chunk."
buffer = []
for char in chars(infile):
if char in uppercase:
yield buffer
buffer = [char]
else:
buffer.append(char)
yield buffer
def encode(infile):
current_state = lowercase
for chunk in chunks(infile):
if (current_state is not uppercase and
chunk and
chunk[0] in uppercase
and any(letter in lowercase for letter in chunk)):
yield DC4
chunk[0] = chunk[0].lower()
for char in chunk:
if char in (DC1, DC2, DC3, DC4, ESC):
yield ESC
elif char in current_state:
pass
elif char in lowercase:
yield DC1
current_state = lowercase
elif char in uppercase:
yield DC2
current_state = uppercase
elif char in numbers:
yield DC3
current_state = numbers
if char in current_state:
yield lowercase[current_state.index(char)]
else:
yield char
ok(''.join(encode('this IS a Test 123! \x12')),
'this \x12is \x11a \x14test \x13bcd! \x1b\x12')
_me = file(__file__).read()
ok(''.join(decode(encode(_me))), _me)
if __name__ == '__main__':
import sys
decoding = (len(sys.argv) > 1 and sys.argv[1] == '-d')
if decoding: sys.argv.pop(1)
function = decode if decoding else encode
infile = file(sys.argv[1]) if len(sys.argv) > 1 else sys.stdin
sys.stdout.writelines(function(infile))
--
To unsubscribe: http://lists.canonical.org/mailman/listinfo/kragen-hacks