Reputation: 1753
I have a program that computes a ton of data and writes it to a file. My data is just a bunch of numbers from 0-16 (17 different values), and I have computed the frequency with which each number appears in the data. I need to use as little disk space and as little RAM as possible, so I wrote a little Huffman encoding/decoding module in pure Python that reads/writes the compressed data while keeping as few encoded symbols in memory at a time as possible. Is there a module that comes with Python that can do something similar? Here is the code with a small example of how it will be used (warning: the code is kinda long, but decently commented):
def makeTree(data):
    """data is a list of tuples, whose first entry is a priority/frequency
    number, and whose second entry is a tuple containing the data to
    be encoded. The tree uses an internal tag system to tell where a
    branch ends. (End nodes are terminated with a False.)"""
    def tag(data):
        taggedData = []
        for priority, datum in data:
            #all of the initial data is an end branch
            taggedData += [(priority, (datum, False))]
        return taggedData
    #get the tagged data into descending order of priority/frequency
    tree = sorted(tag(data), reverse=True)
    while len(tree) > 1:
        #get the two lowest-priority branches
        bottomR, bottomL = tree.pop(), tree.pop()
        #and stick them together into a new branch with combined priority
        new_elem = bottomR[0]+bottomL[0], ((bottomL, bottomR), True)
        #then add them back to the list of branches and sort
        tree += [new_elem]
        tree.sort(reverse=True)
    return tree[0]
def makeTable(tree, code=""):
    """Takes a tree such as generated by makeTree and returns a dictionary
    of code:value pairs."""
    #if this is an end branch, return the code:value pair
    if tree[1][1] == False:
        return {code: tree[1][0]}
    #otherwise find the tables for the left and right branches,
    #add them to the main table, and return
    table = {}
    table.update(makeTable(tree[1][0][0], code+'0')) #left
    table.update(makeTable(tree[1][0][1], code+'1')) #right
    return table
class Decoder:
    """this class creates a Decoder object which is used to decode a compressed
    file using the appropriate decoding table (duh). It used to be a function,
    but it was buggy and would also be ugly if I left it a function. (this
    class was written after the Encoder class.)
    """
    def __init__(self, fname, table):
        #open in binary mode so the bytes aren't mangled on platforms
        #that translate line endings
        self.file = open(fname, 'rb')
        self.table = table
        self.byte = None
        self.bit = 7
        self.newByte = True
    def decode(self, size=1):
        """Decodes and yields size number of symbols from the file.
        Size defaults to 1."""
        #a counter for how many symbols have been read
        read = 0
        code = ''
        while read < size:
            if self.newByte:
                self.byte = ord(self.file.read(1))
            for n in xrange(self.bit, -1, -1):
                code += str((self.byte & 1<<n) >> n)
                self.byte &= (1<<n)-1
                if code in self.table:
                    yield self.table[code]
                    read += 1
                    code = ''
                    if read == size:
                        #remember where we stopped within the current byte
                        self.bit = n-1
                        self.newByte = False
                        raise StopIteration
            self.bit = 7
            self.newByte = True
    def close(self):
        self.file.close()
class Encoder:
    """This class creates an encoder object, which is used to write encoded data
    to a file. It was initially going to be a function, but I couldn't
    accomplish that without the code getting really ugly. :p """
    def __init__(self, fname, table):
        #open in binary mode so the bytes aren't mangled on platforms
        #that translate line endings
        self.file = open(fname, 'wb')
        self.table = table
        self.code = ''
    def encode(self, datum):
        """Attempts to write the encoded datum to the file. If there isn't enough
        code to write a whole number of bytes, then the code is saved up
        until there is."""
        self.code += self.table[datum]
        if len(self.code)%8 == 0:
            self.__write_code_chunk()
        return
    def end_encode(self):
        """Writes any remaining code to the file, padding the code with
        trailing zeros to fit within a byte, then closes the file."""
        #if the length of the code remaining isn't a multiple of 8 bits
        if len(self.code)%8:
            #then add zeros to the end so that it is
            self.code += "0"*(8 - len(self.code)%8)
        self.__write_code_chunk()
        self.file.close()
        return
    def __write_code_chunk(self):
        bytes = len(self.code)/8
        #for every byte (or every 8 bits)...
        for _ in xrange(bytes):
            #turn those bits into a number using int with base 2,
            #then turn the number into an ascii character,
            #and finally write the data to the file.
            self.file.write(chr(int(self.code[:8], 2)))
            #then get rid of the 8 bits just written
            self.code = self.code[8:]
        #make sure there is no code left over
        assert self.code == ''
        return
if __name__=="__main__":
    import random
    mandelbrotData = [
        (0.10776733333333334, 0),
        (0.24859, 1),
        (0.12407666666666667, 2),
        (0.07718866666666667, 3),
        (0.04594733333333333, 4),
        (0.03356, 5),
        (0.023286666666666664, 6),
        (0.018338, 7),
        (0.014030666666666667, 8),
        (0.011918, 9),
        (0.009500666666666668, 10),
        (0.008396666666666667, 11),
        (0.006936, 12),
        (0.006365999999999999, 13),
        (0.005466, 14),
        (0.0048920000000000005, 15),
        (0.2537393333333333, 16)]
    decode_table = makeTable(makeTree(mandelbrotData))
    encode_table = {val:key for key, val in decode_table.iteritems()}
    approx_data = sum([[val]*int(round(freq*10**3/2)) for freq, val in mandelbrotData], [])
    random.shuffle(approx_data)

    testname = 'hufftest'
    encoder = Encoder(testname, encode_table)
    for val in approx_data:
        encoder.encode(val)
    encoder.end_encode()

    decoder = Decoder(testname, decode_table)
    decoded = list(decoder.decode(len(approx_data)/2))
    decoded += list(decoder.decode(len(approx_data)/2))
    print approx_data == decoded
Is there a module that can do something similar faster? If not, are there ways I can change my code to make it faster?
Upvotes: 3
Views: 392
Reputation: 123463
If your data is significantly repetitious, then you might want to try just run-length encoding it, which can be a relatively fast operation. Here's an implementation of one as a generator, which could help minimize overall memory usage. Note that when a run is very short, it outputs only the value rather than a (repeat_count, value) tuple, to avoid bloating the output and possibly making it longer than it was originally.
from itertools import groupby

def run_length_encode(data):
    for value, igroup in groupby(data):
        repeat_count = len(list(igroup))
        yield value if repeat_count == 1 else (repeat_count, value)
if __name__ == '__main__':
    """ generate some random data with repeats and encode it """
    import random
    DATA_SIZE = 20
    MAX_VAL = 16
    MAX_REPEAT = 5
    data = []
    while len(data) < DATA_SIZE:
        val = random.randint(0, MAX_VAL)
        repeat = min(DATA_SIZE-len(data), random.randint(0, MAX_REPEAT))
        for _ in xrange(repeat):
            data.append(val)
    print data
    print [item for item in run_length_encode(data)]
Output:
[5, 5, 5, 9, 8, 8, 7, 5, 5, 5, 5, 5, 1, 7, 9, 16, 16, 16, 16, 16]
[(3, 5), 9, (2, 8), 7, (5, 5), 1, 7, 9, (5, 16)]
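Decoding that mixed stream back is just a matter of telling bare values apart from (repeat_count, value) tuples. Here's a minimal run_length_decode counterpart as a sketch (the name and the isinstance check are my own choices, not something from itertools):

def run_length_decode(encoded):
    """ expand bare values and (repeat_count, value) tuples
        back into the original sequence of values """
    for item in encoded:
        if isinstance(item, tuple):
            repeat_count, value = item
            for _ in xrange(repeat_count):
                yield value
        else:
            yield item

With that, list(run_length_decode(run_length_encode(data))) == data should hold for any input sequence.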
If the runs are very long, it might be better to explicitly count how many items are in each group iteratively, instead of turning each group into a list and taking its length:
def ilen(iterable):
    """ return the number of items in an iterable """
    return sum(1 for _ in iterable)

def run_length_encode(data):
    for value, igroup in groupby(data):
        # repeat_count = len(list(igroup))
        repeat_count = ilen(igroup)
        yield value if repeat_count == 1 else (repeat_count, value)
Since the range of your data values is relatively small, you could (also) encode them into single-byte character values.
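For instance, here's one possible sketch of that byte packing, layered on top of run_length_encode (the rle_to_bytes helper, the marker byte 17, and the one-byte repeat-count cap are all assumptions of this sketch, not stdlib behavior):

def rle_to_bytes(encoded):
    """ pack the run-length-encoded stream into single-byte characters:
        values 0-16 are written directly as one byte each, and a run is
        written as a marker byte followed by a count byte and a value byte """
    RUN_MARKER = 17  # any byte value outside the 0-16 data range will do
    chunks = []
    for item in encoded:
        if isinstance(item, tuple):
            repeat_count, value = item
            while repeat_count > 255:  # a repeat count must fit in one byte
                chunks.append(chr(RUN_MARKER) + chr(255) + chr(value))
                repeat_count -= 255
            chunks.append(chr(RUN_MARKER) + chr(repeat_count) + chr(value))
        else:
            chunks.append(chr(item))
    return ''.join(chunks)

Something like open('rletest', 'wb').write(rle_to_bytes(run_length_encode(data))) would then store each plain value in one byte and each run in three.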
Upvotes: 0