Description
Simulate Huffman coding, for lossless compression of a long text.
Show intermediate states of the memory usage and how exactly the data is stored.
Also compare the original text size with the final compressed version, to show the efficiency of compression.
Disclaimer
It misses an edge case where all the text contains just one unique character. The function will behave unexpectedly for such a text.
There is an inefficiency in the huffman_tree here. We don’t need to store the sum of the frequency counts of each subtree. We can save some spcace for the tree there. This was stored for the representational purposes only. To show how the tree actually looks like.
Complexity
Size of input text: n
Uniq chars in text: q
- Creating the tree:
- Time Complexity: O(
n
+qlog(q)
), counting frequency of uniq chars + creating the tree. - Space Used for tree: O(
q
), total2*q
nodes will be there - Auxiliary space: O(
q
), keeping the frequency table for unique chars
- Time Complexity: O(
- Creating the table: Uses dfs to reach all the leaf nodes and record the path to reach there against the leaf value in a hashmap
- Time Complexity: O(
q
) - Space used for table: O(
q
) - Reccursive stack space: O(
log(q)
), maximum depth of the tree ~ 9-10
- Time Complexity: O(
- Encoding:
- Time Complexity: O(
n
), read every char and replace it with corresponding bitpath - No extra space required than the table used for encoding
- Time Complexity: O(
- Decoding:
- Time Complexity: O(
n
), traverse the tree one step over each bit in the encoding. - No extra space required than the tree used for decoding
- Time Complexity: O(
Code
https://ideone.com/7BBdMz
Using queue based implementation for creating the huffman tree. This minimise the variance in the final coding. We will use two queues, one for storing leaves and other for storing internal nodes. We will generate the internal node by taking sum of the two minimum nodes from both the queues. We will use the node from leaves queue if there is a tie, that will ensure that the length of the final code for each leaf is minimum. Hence minimizing the variance.
Queue based implementation: https://ideone.com/AdUnrr
Encoded values can be seen alongside tree only: https://ideone.com/oIVSI5
Code
#include <iostream>
#include <unordered_map>
#include <map>
#include <vector>
#include <climits>
using namespace std;
struct Node{
int x;
Node* l;
Node* r;
Node(int X=0, Node* L=NULL, Node* R=NULL) : x(X), l(L), r(R) {}
};
Node* huffman_tree(const string &s){
if(!s.size()) return NULL;
unordered_map<char,int> h;
for(char c: s)
h[c]++;
multimap<int, Node*> v;
for(auto p: h)
v.insert({p.second, new Node(p.first)});
pair<int,Node*> p = *v.begin();
v.erase(v.begin());
while(!v.empty()){
auto it = v.begin();
v.insert({p.first+it->first, new Node(p.first+it->first,p.second,it->second)});
v.erase(it);
it = v.begin();
p = *it;
v.erase(it);
}
return p.second;
}
void huffman_helper(unordered_map<char,vector<bool>> &t, Node* node, vector<bool> v){
if(!(node->l||node->r)){
t[node->x] = v;
return;
}
v.push_back(false);
huffman_helper(t,node->l,v);
v.back() = true;
huffman_helper(t,node->r,v);
}
unordered_map<char,vector<bool>> huffman_table(Node* h){
unordered_map<char,vector<bool>> t;
if(!h) return t;
huffman_helper(t,h,vector<bool>());
return t;
}
void print(char c){
if(c=='\s') cout<<"\\s";
else if(c=='\t') cout<<"\\t";
else if(c=='\n') cout<<"\\n";
else cout<<c;
}
void print_binary_tree(Node* node, int depth=0, bool leaf_check=false){
// this will print a leaf node differently if the leaf_check is set to true
// print format for leaf can be defined in else block below.
if(!node) return;
print_binary_tree(node->r, depth+1, leaf_check);
string str = "";
for(int i=0; i<depth; i++)
cout<<"|\t";
if(!leaf_check||node->l||node->r) cout<<node->x;
else print(node->x); // print in this format if it's a leaf
cout<<endl;
print_binary_tree(node->l, depth+1, leaf_check);
return;
}
void print(const vector<bool> &v){
for(const auto &x: v)
cout<<x;
cout<<endl;
}
vector<bool> encode(const string &s, unordered_map<char,vector<bool>> &t){
vector<bool> v;
for(char c: s)
for(bool b: t[c])
v.push_back(b);
return v;
}
string decode(const vector<bool> &v, Node* h){
string s = "";
Node* node = h;
for(bool b: v){
node = b?node->r:node->l;
if(!(node->l||node->r)){
s.push_back(char(node->x));
node = h;
}
}
return s;
}
int main() {
// CAUTION: this method is an exception for the case
// when all text contains just one unique character.
// E.g. ccccccccccccccccccccccccccccccc, without any whitespace etc.
string s;
string p="";
while(getline(cin,s))
p += s + "\n";
Node* h = huffman_tree(p);
print_binary_tree(h,0,true);
cout<<endl<<endl;
unordered_map<char,vector<bool>> t = huffman_table(h);
int sum = 0;
for(auto p: t){
print(p.first);
cout<<'\t';
print(p.second);
sum += p.second.size();
}
cout<<endl;
cout<<"Size of text: text_length*char_size = ";
cout<<p.size()<<" * "<<sizeof(char)*CHAR_BIT<<" = ";
cout<<p.size()*sizeof(char)*CHAR_BIT<<endl;
cout<<"Size of table: uniq_chars*(char_size + avg_encoding_size) = ";
cout<<t.size()<<" * ("<<sizeof(char)*CHAR_BIT<<" + "<<sum<<"/"<<t.size()<<") = ";
cout<<t.size()*sizeof(char)*CHAR_BIT+sum<<endl;
vector<bool> v = encode(p,t);
cout<<"Size of encoded string: "<<v.size()<<endl;
print(v);
cout<<endl;
string text = decode(v,h);
cout<<"Check if the original text and decoded text matches? ";
cout<<(p==text?"Yes.":"No.")<<endl;
cout<<endl<<text<<endl;
return 0;
}
Ideone it!
#include <iostream>
#include <unordered_map>
#include <unordered_set>
#include <algorithm>
#include <map>
#include <vector>
#include <climits>
#include <queue>
using namespace std;
struct Node{
int x;
Node* l;
Node* r;
Node(int X=0, Node* L=NULL, Node* R=NULL) : x(X), l(L), r(R) {}
};
struct comp {
unordered_map<char,int> &h;
comp(unordered_map<char,int> &x) : h(x) {}
bool operator() (const char &a, const char &b) { return h[a]<h[b]; }
};
int min(queue<Node*> &leaves, queue<Node*> &nodes, unordered_map<char,int> &h, Node* &ans){
if(leaves.empty()&&nodes.empty()){
ans = NULL;
return 0;
}
bool l_or_n;
if(leaves.empty()||nodes.empty()) l_or_n = leaves.empty();
else l_or_n = nodes.front()->x < h[leaves.front()->x];
// tie breaker here ^^^^^^^^^^^^ leads to a minimum variance
if(l_or_n){
ans = nodes.front();
nodes.pop();
return ans->x;
}
else{
ans = leaves.front();
leaves.pop();
return h[ans->x];
}
}
Node* huffman_tree(const string &s){
if(!s.size()) return NULL;
unordered_map<char,int> h;
unordered_set<char> uniq_s;
for(char c: s)
h[c]++, uniq_s.insert(c);
vector<char> v(uniq_s.begin(), uniq_s.end());
sort(v.begin(), v.end(), comp(h));
queue<Node*> leaves;
queue<Node*> nodes;
for(char c: v)
leaves.push(new Node(c));
Node* node;
int node_f = min(leaves, nodes, h, node);
Node* temp;
int temp_f;
while(leaves.size()||nodes.size()){
temp_f = min(leaves, nodes, h, temp);
nodes.push(new Node(node_f+temp_f, node, temp));
node_f = min(leaves, nodes, h, node);
}
return node;
}
void huffman_helper(unordered_map<char,vector<bool>> &t, Node* node, vector<bool> v){
if(!(node->l||node->r)){
t[node->x] = v;
return;
}
v.push_back(false);
huffman_helper(t,node->l,v);
v.back() = true;
huffman_helper(t,node->r,v);
}
unordered_map<char,vector<bool>> huffman_table(Node* h){
unordered_map<char,vector<bool>> t;
if(!h) return t;
huffman_helper(t,h,vector<bool>());
return t;
}
void print(char c){
if(c=='\s') cout<<"\\s";
else if(c=='\t') cout<<"\\t";
else if(c=='\n') cout<<"\\n";
else cout<<c;
}
void print_binary_tree(Node* node, int depth=0, bool leaf_check=false){
// this will print a leaf node differently if the leaf_check is set to true
// print format for leaf can be defined in else block below.
if(!node) return;
print_binary_tree(node->r, depth+1, leaf_check);
string str = "";
for(int i=0; i<depth; i++)
cout<<"|\t";
if(!leaf_check||node->l||node->r) cout<<node->x;
else print(node->x); // print in this format if it's a leaf
cout<<endl;
print_binary_tree(node->l, depth+1, leaf_check);
return;
}
void print(const vector<bool> &v){
for(const auto &x: v)
cout<<x;
cout<<endl;
}
vector<bool> encode(const string &s, unordered_map<char,vector<bool>> &t){
vector<bool> v;
for(char c: s)
for(bool b: t[c])
v.push_back(b);
return v;
}
string decode(const vector<bool> &v, Node* h){
string s = "";
Node* node = h;
for(bool b: v){
node = b?node->r:node->l;
if(!(node->l||node->r)){
s.push_back(char(node->x));
node = h;
}
}
return s;
}
int main() {
// CAUTION: this method is an exception for the case
// when all text contains just one unique character.
// E.g. ccccccccccccccccccccccccccccccc, without any whitespace etc.
string s;
string p="";
while(getline(cin,s))
p += s + "\n";
Node* h = huffman_tree(p);
print_binary_tree(h,0,true);
cout<<endl<<endl;
unordered_map<char,vector<bool>> t = huffman_table(h);
int sum = 0;
for(auto p: t){
print(p.first);
cout<<'\t';
print(p.second);
sum += p.second.size();
}
cout<<endl;
cout<<"Size of text: text_length*char_size = ";
cout<<p.size()<<" * "<<sizeof(char)*CHAR_BIT<<" = ";
cout<<p.size()*sizeof(char)*CHAR_BIT<<endl;
cout<<"Size of table: uniq_chars*(char_size + avg_encoding_size) = ";
cout<<t.size()<<" * ("<<sizeof(char)*CHAR_BIT<<" + "<<sum<<"/"<<t.size()<<") = ";
cout<<t.size()*sizeof(char)*CHAR_BIT+sum<<endl;
vector<bool> v = encode(p,t);
cout<<"Size of encoded string: "<<v.size()<<endl;
print(v);
cout<<endl;
string text = decode(v,h);
cout<<"Check if the original text and decoded text matches? ";
cout<<(p==text?"Yes.":"No.")<<endl;
cout<<endl<<text<<endl;
return 0;
}
Ideone it!
#include <iostream>
#include <unordered_map>
#include <map>
#include <vector>
#include <climits>
using namespace std;
struct Node{
int x;
Node* l;
Node* r;
Node(int X=0, Node* L=NULL, Node* R=NULL) : x(X), l(L), r(R) {}
};
Node* huffman_tree(const string &s){
if(!s.size()) return NULL;
unordered_map<char,int> h;
for(char c: s)
h[c]++;
multimap<int, Node*> v;
for(auto p: h)
v.insert({p.second, new Node(p.first)});
pair<int,Node*> p = *v.begin();
v.erase(v.begin());
while(!v.empty()){
auto it = v.begin();
v.insert({p.first+it->first, new Node(p.first+it->first,p.second,it->second)});
v.erase(it);
it = v.begin();
p = *it;
v.erase(it);
}
return p.second;
}
void huffman_helper(unordered_map<char,vector<bool>> &t, Node* node, vector<bool> v){
if(!(node->l||node->r)){
t[node->x] = v;
return;
}
v.push_back(false);
huffman_helper(t,node->l,v);
v.back() = true;
huffman_helper(t,node->r,v);
}
unordered_map<char,vector<bool>> huffman_table(Node* h){
unordered_map<char,vector<bool>> t;
if(!h) return t;
huffman_helper(t,h,vector<bool>());
return t;
}
void print(char c){
if(c=='\s') cout<<"\\s";
else if(c=='\t') cout<<"\\t";
else if(c=='\n') cout<<"\\n";
else cout<<c;
}
void print(const vector<bool> &v){
for(const auto &x: v)
cout<<x;
}
class NodePrinter{
private:
unordered_map<char,vector<bool>> t;
int sum;
public:
NodePrinter(unordered_map<char,vector<bool>> &huffman_table) : t(huffman_table), sum(0) {
for(auto p: t)
sum += p.second.size();
}
int getSum(){
return sum;
}
void operator() (Node* node){
if(node->l||node->r){
cout<<node->x;
return;
}
print(char(node->x));
if(t.find(node->x)==t.end()) return;
cout<<"\t->\t";
print(t[node->x]);
}
};
void print_binary_tree(Node* node, NodePrinter &printer, int depth=0){
if(!node) return;
print_binary_tree(node->r, printer, depth+1);
string str = "";
for(int i=0; i<depth; i++)
cout<<"|\t";
printer(node);
cout<<endl;
print_binary_tree(node->l, printer, depth+1);
return;
}
vector<bool> encode(const string &s, unordered_map<char,vector<bool>> &t){
vector<bool> v;
for(char c: s)
for(bool b: t[c])
v.push_back(b);
return v;
}
string decode(const vector<bool> &v, Node* h){
string s = "";
Node* node = h;
for(bool b: v){
node = b?node->r:node->l;
if(!(node->l||node->r)){
s.push_back(char(node->x));
node = h;
}
}
return s;
}
int main() {
// CAUTION: this method is an exception for the case
// when all text contains just one unique character.
// E.g. ccccccccccccccccccccccccccccccc, without any whitespace etc.
string s;
string p="";
while(getline(cin,s))
p += s + "\n";
Node* h = huffman_tree(p);
unordered_map<char,vector<bool>> t = huffman_table(h);
NodePrinter np(t);
print_binary_tree(h,np);
cout<<endl<<endl;
int sum = np.getSum();
cout<<endl;
cout<<"Size of text: text_length*char_size = ";
cout<<p.size()<<" * "<<sizeof(char)*CHAR_BIT<<" = ";
cout<<p.size()*sizeof(char)*CHAR_BIT<<endl;
cout<<"Size of table: uniq_chars*(char_size + avg_encoding_size) = ";
cout<<t.size()<<" * ("<<sizeof(char)*CHAR_BIT<<" + "<<sum<<"/"<<t.size()<<") = ";
cout<<t.size()*sizeof(char)*CHAR_BIT+sum<<endl;
vector<bool> v = encode(p,t);
cout<<"Size of encoded string: "<<v.size()<<endl;
print(v);
cout<<endl;
cout<<endl;
string text = decode(v,h);
cout<<"Check if the original text and decoded text matches? ";
cout<<(p==text?"Yes.":"No.")<<endl;
cout<<endl<<text<<endl;
return 0;
}