Reputation: 57
I have written a Huffman Compression Algorithm in python, but so far I have only managed to do the compression part (by making a priority queue using heapq). I have created a class called HuffmanCoding which takes in the text to compress. I know that one way of decompressing is saving a dictionary with the characters and their codes to the compressed text file, but I am not sure about how I should do this (especially because my compressed text is stored in a binary file). Does anyone have any pointers or advice to how I could go about decompressing? Below is my compression code.
class HeapNode:
def __init__(self, char, freq):
self.char = char
self.freq = freq
self.left = None
self.right = None
def __lt__(self, other): # if the frequency of one character is lower than the frequency of another one
return self.freq < other.freq
def __eq__(self, other): # if two characters have the same frequencies
if other == None:
return False
if not isinstance(other, HeapNode): # checks if the character is a node or not
return False
return self.freq == other.freq
class HuffmanCoding:
def __init__(self, text_to_compress):
self.text_to_compress = text_to_compress # text that will be compressed
self.heap = []
self.codes = {} # will store the Huffman code of each character
def get_frequency(self): # method to find frequency of each character in text - RLE
frequency_Dictionary = {} # creates an empty dictionary where frequency of each character will be stored
for character in self.text_to_compress: # Iterates through the text to be compressed
if character in frequency_Dictionary:
frequency_Dictionary[character] = frequency_Dictionary[character] + 1 # if character already exists in
# dictionary, its value is increased by 1
else:
frequency_Dictionary[character] = 1 # if character is not present in list, its value is set to 1
return frequency_Dictionary
def make_queue(self, frequency): # creates the priority queue of each character and its associated frequency
for key in frequency:
node = HeapNode(key, frequency[key]) # create node (character) and store its frequency alongside it
heapq.heappush(self.heap, node) # Push the node into the heap
def merge_nodes(
self): # creates HuffmanTree by getting the two minimum nodes and merging them together, until theres
# only one node left
while len(self.heap) > 1:
node1 = heapq.heappop(self.heap) # pop node from top of heap
node2 = heapq.heappop(self.heap) # pop next node which is now at the top of heap
merged = HeapNode(None, node1.freq + node2.freq) # merge the two nodes we popped out from heap
merged.left = node1
merged.right = node2
heapq.heappush(self.heap, merged) # push merged node back into the heap
def make_codes(self, root, current_code): # Creates Huffman code for each character
if root == None:
return
if root.char != None:
self.codes[root.char] = current_code
self.make_codes(root.left, current_code + "0") # Every time you traverse left, add a 0 - Recursive Call
self.make_codes(root.right, current_code + "1") # Every time you traverse right, add a 1 - Recursive Call
def assignCodes(self): # Assigns codes to each character
root = heapq.heappop(self.heap) # extracts root node from heap
current_code = ""
self.make_codes(root, current_code)
def get_compressed_text(self, text): # Replaces characters in original text with codes
compressed_text = ""
for character in text:
compressed_text += self.codes[character]
return compressed_text
def pad_encoded_text(self, compressed_text):
extra_padding = 8 - len(compressed_text) % 8 # works out how much extra padding is required
for i in range(extra_padding):
compressed_text += "0" # adds the amount of 0's that are required
padded_info = "{0:08b}".format(extra_padding)
compressed_text = padded_info + compressed_text
return compressed_text
def make_byte_array(self, padded_text):
byte_array = bytearray()
for i in range(0, len(padded_text), 8):
byte_array.append(int(padded_text[i:i + 8], 2))
return(byte_array)
def show_compressed_text(self):
frequency = self.get_frequency()
self.make_queue(frequency)
self.merge_nodes()
self.assignCodes()
encoded_text = self.get_compressed_text(self.text_to_compress)
padded_encoded_text = self.pad_encoded_text(encoded_text)
byte_array = self.make_byte_array(padded_encoded_text)
return bytes(byte_array)
def remove_padding(self, padded_encoded_text): # removes the padding that was added
padded_info = padded_encoded_text[:8]
extra_padding = int(padded_info, 2)
padded_encoded_text = padded_encoded_text[8:]
encoded_text = padded_encoded_text[:-1 * extra_padding]
return encoded_text```
Upvotes: 0
Views: 1219
Reputation: 112374
First you need to send a description of the Huffman code along with the data. Second you need to use that on the decompression end to decode the Huffman codes.
The most straightforward way is to traverse the tree, sending a 1 bit for every node encountered and a 0 bit followed by the symbol bits for every leaf. On the decompression end you can read that and reconstruct the tree. Then you can use the tree, traversing to the left or right with each bit of data until you get to a leaf. Emit the symbol for that leaf, and then start back at the root of the tree for the next bit.
The way this is usually done in compression libraries is to discard the tree and instead keep and send just the code lengths for every symbol, compressing that as well. Then on both the compression and decompression side, the same canonical Huffman code is constructed from the lengths. Then a table look up can be used for decoding, for speed.
Upvotes: 2