#!/usr/bin/env python
## (c) David MacKay - Free software. License: GPL
## For license statement see http://www.gnu.org/copyleft/gpl.html
"""
This is a RunLength Encoding compression algorithm
that uses the Huffman algorithm to define a code
for runlengths.
The package contains the following functions:
findprobs(f=0.01,N=69):
Find probabilities of all the events
- No 0s followed by a 1;
- 1 0 followed by a 1;
- 2 0s followed by a 1;
- - - - - - - -
- (N-1) 0s followed by a 1;
- N 0s
RLencode(string,symbols,N):
Reads in a string of 0s and 1s.
Creates a list of run lengths and special symbols 'Z',
which denote 'a lot of Zeroes'. ('A lot' is N.)
Sends this list to the general-purpose Huffman encoder
defined by the nodes in the list "symbols".
RLdecode(string,root,N):
Decode a binary string into runs, then return appropriate 0s and 1s
compress_it( inputfile, outputfile ):
Make Huffman code using runlengths, and compress
uncompress_it( inputfile, outputfile ):
Make Huffman code using runlengths, and uncompress
There are also three test functions.
If RLE.py is run from a terminal, it invokes compress_it (using stdin)
or uncompress_it (using stdin), respectively if there are zero arguments
or one argument.
"""
## /home/mackay/python/compression/huffman/Huffman3.py
## This supplies the huffman algorithm, complete with encoders and decoders:
from Huffman3 import *
verbose=0
def findprobs(f=0.01,N=69):
""" Find probabilities of all the events
- No 0s followed by a 1;
- 1 0 followed by a 1;
- 2 0s followed by a 1;
- 3 0s followed by a 1;
- - -
- (N-1) 0s followed by a 1;
- N 0s
>>> print findprobs(0.1,3) # doctest:+ELLIPSIS
[('0', 0.1...), ('1', 0.09...), ('2', 0.081...), ('Z', 0.729...)]
"""
answer = []
for n in range(N):
answer.append( (`n`, f * (1-f)**n) )
pass
answer.append( ('Z', (1-f)**N) )
assert ( len(answer) == N+1 )
return answer
def RLencode(string,symbols,N):
"""
Reads in a string of 0s and 1s.
Creates a list of run lengths and special symbols 'Z',
which denote 'a lot of Zeroes'. ('A lot' is N.)
Sends this list to the general-purpose Huffman encoder
defined by the nodes in the list "symbols".
"""
chars = list(string)
runs = [] ## list of run lengths
r = 0 ## the current run length
for c in chars:
if ( c == '0' ):
r += 1
if (r>=N):
runs.append('Z')
r=0
pass
pass
else:
runs.append(`r`)
r=0
pass
pass
runs.append(`r`) ## pretend that there is a final 1. The decoder must strip this off.
if verbose:
print "runs to be encoded:"
print runs
pass
zipped = encode( runs , symbols )
return zipped
def RLdecode(string,root,N):
"""
Decode a binary string into runs, then return appropriate 0s and 1s
"""
answer = decode( string, root )
if verbose:
print "runs from decoder:"
print answer
pass
output = ""
for r in answer:
if ( r == 'Z' ):
output = output + '0'*N
else:
temporary = output + '0'*int(r)
output = temporary + '1'
pass
## strip off the final '1'
assert ( r != 'Z' ) ## the output at this point should always end with a 1
output = temporary ## remove that final 1
return output
def easytest():
N=5
f=0.01
probs = findprobs(f,N)
symbols = makenodes(probs) # makenodes is defined at the bottom of Huffman3 package
root = iterate(symbols) # make huffman code and put it into the symbols' nodes, and return the root of the decoding tree
symbols.sort(lambda x, y: cmp(x.index, y.index)) # sort by index
for co in symbols : # and write the answer
co.report()
zipped = encode(['0','1','2','Z','2','1','0'], symbols)
print zipped
answer = decode( zipped, root )
print answer
for s in [ "00000000000001",\
"00000000000011",\
"00000000000011",\
"00000000000111",\
"0000000000110",\
"0000000000111100000000",\
"0000000000001111000000000",\
"01", "1000" , "0000000000001000001000000001100010" ,\
"0000000000000000000000000000000000000000010000000000000000000000000000000000000001000000000000000000000001110" ]:
print "=== encoding", s
zipped = RLencode( s, symbols, N )
print zipped
output = RLdecode( zipped, root, N)
print output
if (s==output):
print "OK!"
else:
print s
print output
print "ERROR"
pass
pass
pass
def test():
easytest()
import doctest
verbose=1
if(verbose):
doctest.testmod(None,None,None,True)
else:
doctest.testmod()
hardertest()
pass
def hardertest():
print "Reading the BentCoinFile"
inputfile = open( "/home/mackay/compress/BentCoinFile" , "r" )
outputfile = open( "tmp.zip" , "w" )
print "Compressing to tmp.zip"
compress_it(inputfile, outputfile)
outputfile.close(); inputfile.close()
# print "DONE compressing"
inputfile = open( "tmp.zip" , "r" )
outputfile = open( "tmp2" , "w" )
print "Uncompressing to tmp2"
uncompress_it(inputfile, outputfile)
outputfile.close(); inputfile.close()
# print "DONE uncompressing"
print "Checking for differences"
import os
os.system( "diff /home/mackay/compress/BentCoinFile tmp2" )
pass
f=0.01; N=69
def compress_it( inputfile, outputfile ):
"""
Make Huffman code using runlengths, and
Compress from file (possibly stdin).
"""
probs = findprobs(f,N)
symbols = makenodes(probs)
root = iterate(symbols) # make huffman code and put it into the symbols' nodes, and return the root of the decoding tree
string = inputfile.read()
outputfile.write( RLencode(string, symbols, N) )
pass
def uncompress_it( inputfile, outputfile ):
"""
Make Huffman code using runlengths, and
UNCompress from file (possibly stdin).
"""
probs = findprobs(f,N)
symbols = makenodes(probs)
root = iterate(symbols) # make huffman code and put it into the symbols' nodes, and return the root of the decoding tree
string = inputfile.read()
outputfile.write( RLdecode(string, root, N) )
pass
if __name__ == '__main__':
print "testing runlength encoder"
test()
pass