ideone

useful pieces of codes

View on GitHub

Description

Simulate Huffman coding, for lossless compression of a long text.
Show intermediate states of the memory usage and how exactly the data is stored.
Also compare the original text size with the final compressed version, to show the efficiency of compression.

Disclaimer

It misses an edge case where all the text contains just one unique character. The function will behave unexpectedly for such a text.
There is an inefficiency in the huffman_tree here. We don’t need to store the sum of the frequency counts of each subtree. We can save some spcace for the tree there. This was stored for the representational purposes only. To show how the tree actually looks like.

Complexity

Size of input text: n
Uniq chars in text: q

Code

https://ideone.com/7BBdMz

Using queue based implementation for creating the huffman tree. This minimise the variance in the final coding. We will use two queues, one for storing leaves and other for storing internal nodes. We will generate the internal node by taking sum of the two minimum nodes from both the queues. We will use the node from leaves queue if there is a tie, that will ensure that the length of the final code for each leaf is minimum. Hence minimizing the variance.

Queue based implementation: https://ideone.com/AdUnrr

Encoded values can be seen alongside tree only: https://ideone.com/oIVSI5

Code

#include <iostream>
#include <unordered_map>
#include <map>
#include <vector>
#include <climits>
using namespace std;
struct Node{
	int x;
	Node* l;
	Node* r;
	Node(int X=0, Node* L=NULL, Node* R=NULL) : x(X), l(L), r(R) {}
};
Node* huffman_tree(const string &s){
	if(!s.size())	return NULL;
	unordered_map<char,int> h;
	for(char c: s)
		h[c]++;
	multimap<int, Node*> v;
	for(auto p: h)
		v.insert({p.second, new Node(p.first)});
	pair<int,Node*> p = *v.begin();
	v.erase(v.begin());
	while(!v.empty()){
		auto it = v.begin();
		v.insert({p.first+it->first, new Node(p.first+it->first,p.second,it->second)});
		v.erase(it);
		it = v.begin();
		p = *it;
		v.erase(it);
	}
	return p.second;
}
void huffman_helper(unordered_map<char,vector<bool>> &t, Node* node, vector<bool> v){
	if(!(node->l||node->r)){
		t[node->x] = v;
		return;
	}
	v.push_back(false);
	huffman_helper(t,node->l,v);
	v.back() = true;
	huffman_helper(t,node->r,v);
}
unordered_map<char,vector<bool>> huffman_table(Node* h){
	unordered_map<char,vector<bool>> t;
	if(!h)	return t;
	huffman_helper(t,h,vector<bool>());
	return t;
}
void print(char c){
	if(c=='\s')	cout<<"\\s";
	else if(c=='\t')	cout<<"\\t";
	else if(c=='\n')	cout<<"\\n";
	else cout<<c;
}
void print_binary_tree(Node* node, int depth=0, bool leaf_check=false){
	// this will print a leaf node differently if the leaf_check is set to true
	// print format for leaf can be defined in else block below.
	if(!node)	return;
	print_binary_tree(node->r, depth+1, leaf_check);
	string str = "";
	for(int i=0; i<depth; i++)
		cout<<"|\t";
	if(!leaf_check||node->l||node->r)	cout<<node->x;
	else	print(node->x);	// print in this format if it's a leaf
	cout<<endl;
	print_binary_tree(node->l, depth+1, leaf_check);
	return;
}
void print(const vector<bool> &v){
	for(const auto &x: v)
		cout<<x;
	cout<<endl;
}
vector<bool> encode(const string &s, unordered_map<char,vector<bool>> &t){
	vector<bool> v;
	for(char c: s)
		for(bool b: t[c])
			v.push_back(b);
	return v;
}
string decode(const vector<bool> &v, Node* h){
	string s = "";
	Node* node = h;
	for(bool b: v){
		node = b?node->r:node->l;
		if(!(node->l||node->r)){
			s.push_back(char(node->x));
			node = h;
		}
	}
	return s;
}
int main() {
	// CAUTION: this method is an exception for the case 
	// when all text contains just one unique character.
	// E.g. ccccccccccccccccccccccccccccccc, without any whitespace etc.
	string s;
	string p="";
	while(getline(cin,s))
		p += s + "\n";
	Node* h = huffman_tree(p);
	print_binary_tree(h,0,true);
	cout<<endl<<endl;
	unordered_map<char,vector<bool>> t = huffman_table(h);
	int sum = 0;
	for(auto p: t){
		print(p.first);
		cout<<'\t';
		print(p.second);
		sum += p.second.size();
	}
	cout<<endl;
	cout<<"Size of text: text_length*char_size = ";
	cout<<p.size()<<" * "<<sizeof(char)*CHAR_BIT<<" = ";
	cout<<p.size()*sizeof(char)*CHAR_BIT<<endl;
	cout<<"Size of table: uniq_chars*(char_size + avg_encoding_size) = ";
	cout<<t.size()<<" * ("<<sizeof(char)*CHAR_BIT<<" + "<<sum<<"/"<<t.size()<<") = ";
	cout<<t.size()*sizeof(char)*CHAR_BIT+sum<<endl;
	vector<bool> v = encode(p,t);
	cout<<"Size of encoded string: "<<v.size()<<endl;
	print(v);
	cout<<endl;
	string text = decode(v,h);
	cout<<"Check if the original text and decoded text matches? ";
	cout<<(p==text?"Yes.":"No.")<<endl;
	cout<<endl<<text<<endl;
	return 0;
}

Ideone it!

#include <iostream>
#include <unordered_map>
#include <unordered_set>
#include <algorithm>
#include <map>
#include <vector>
#include <climits>
#include <queue>
using namespace std;
struct Node{
	int x;
	Node* l;
	Node* r;
	Node(int X=0, Node* L=NULL, Node* R=NULL) : x(X), l(L), r(R) {}
};
struct comp {
	unordered_map<char,int> &h;
	comp(unordered_map<char,int> &x) : h(x) {}
	bool operator() (const char &a, const char &b) { return h[a]<h[b]; }
};
int min(queue<Node*> &leaves, queue<Node*> &nodes, unordered_map<char,int> &h, Node* &ans){
	if(leaves.empty()&&nodes.empty()){
		ans = NULL;
		return 0;
	}
	bool l_or_n;
	if(leaves.empty()||nodes.empty())	l_or_n = leaves.empty();
	else	l_or_n = nodes.front()->x < h[leaves.front()->x];
	//			tie breaker here ^^^^^^^^^^^^ leads to a minimum variance
	if(l_or_n){
		ans = nodes.front();
		nodes.pop();
		return ans->x;
	}
	else{
		ans = leaves.front();
		leaves.pop();
		return h[ans->x];
	}
}
Node* huffman_tree(const string &s){
	if(!s.size())	return NULL;
	unordered_map<char,int> h;
	unordered_set<char> uniq_s;
	for(char c: s)
		h[c]++, uniq_s.insert(c);
	vector<char> v(uniq_s.begin(), uniq_s.end());
	sort(v.begin(), v.end(), comp(h));
	queue<Node*> leaves;
	queue<Node*> nodes;
	for(char c: v)
		leaves.push(new Node(c));
	Node* node;
	int node_f = min(leaves, nodes, h, node);
	Node* temp;
	int temp_f;
	while(leaves.size()||nodes.size()){
		temp_f = min(leaves, nodes, h, temp);
		nodes.push(new Node(node_f+temp_f, node, temp));
		node_f = min(leaves, nodes, h, node);
	}
	return node;
}
void huffman_helper(unordered_map<char,vector<bool>> &t, Node* node, vector<bool> v){
	if(!(node->l||node->r)){
		t[node->x] = v;
		return;
	}
	v.push_back(false);
	huffman_helper(t,node->l,v);
	v.back() = true;
	huffman_helper(t,node->r,v);
}
unordered_map<char,vector<bool>> huffman_table(Node* h){
	unordered_map<char,vector<bool>> t;
	if(!h)	return t;
	huffman_helper(t,h,vector<bool>());
	return t;
}
void print(char c){
	if(c=='\s')	cout<<"\\s";
	else if(c=='\t')	cout<<"\\t";
	else if(c=='\n')	cout<<"\\n";
	else cout<<c;
}
void print_binary_tree(Node* node, int depth=0, bool leaf_check=false){
	// this will print a leaf node differently if the leaf_check is set to true
	// print format for leaf can be defined in else block below.
	if(!node)	return;
	print_binary_tree(node->r, depth+1, leaf_check);
	string str = "";
	for(int i=0; i<depth; i++)
		cout<<"|\t";
	if(!leaf_check||node->l||node->r)	cout<<node->x;
	else	print(node->x);	// print in this format if it's a leaf
	cout<<endl;
	print_binary_tree(node->l, depth+1, leaf_check);
	return;
}
void print(const vector<bool> &v){
	for(const auto &x: v)
		cout<<x;
	cout<<endl;
}
vector<bool> encode(const string &s, unordered_map<char,vector<bool>> &t){
	vector<bool> v;
	for(char c: s)
		for(bool b: t[c])
			v.push_back(b);
	return v;
}
string decode(const vector<bool> &v, Node* h){
	string s = "";
	Node* node = h;
	for(bool b: v){
		node = b?node->r:node->l;
		if(!(node->l||node->r)){
			s.push_back(char(node->x));
			node = h;
		}
	}
	return s;
}
int main() {
	// CAUTION: this method is an exception for the case 
	// when all text contains just one unique character.
	// E.g. ccccccccccccccccccccccccccccccc, without any whitespace etc.
	string s;
	string p="";
	while(getline(cin,s))
		p += s + "\n";
	Node* h = huffman_tree(p);
	print_binary_tree(h,0,true);
	cout<<endl<<endl;
	unordered_map<char,vector<bool>> t = huffman_table(h);
	int sum = 0;
	for(auto p: t){
		print(p.first);
		cout<<'\t';
		print(p.second);
		sum += p.second.size();
	}
	cout<<endl;
	cout<<"Size of text: text_length*char_size = ";
	cout<<p.size()<<" * "<<sizeof(char)*CHAR_BIT<<" = ";
	cout<<p.size()*sizeof(char)*CHAR_BIT<<endl;
	cout<<"Size of table: uniq_chars*(char_size + avg_encoding_size) = ";
	cout<<t.size()<<" * ("<<sizeof(char)*CHAR_BIT<<" + "<<sum<<"/"<<t.size()<<") = ";
	cout<<t.size()*sizeof(char)*CHAR_BIT+sum<<endl;
	vector<bool> v = encode(p,t);
	cout<<"Size of encoded string: "<<v.size()<<endl;
	print(v);
	cout<<endl;
	string text = decode(v,h);
	cout<<"Check if the original text and decoded text matches? ";
	cout<<(p==text?"Yes.":"No.")<<endl;
	cout<<endl<<text<<endl;
	return 0;
}

Ideone it!

#include <iostream>
#include <unordered_map>
#include <map>
#include <vector>
#include <climits>
using namespace std;
struct Node{
	int x;
	Node* l;
	Node* r;
	Node(int X=0, Node* L=NULL, Node* R=NULL) : x(X), l(L), r(R) {}
};
Node* huffman_tree(const string &s){
	if(!s.size())	return NULL;
	unordered_map<char,int> h;
	for(char c: s)
		h[c]++;
	multimap<int, Node*> v;
	for(auto p: h)
		v.insert({p.second, new Node(p.first)});
	pair<int,Node*> p = *v.begin();
	v.erase(v.begin());
	while(!v.empty()){
		auto it = v.begin();
		v.insert({p.first+it->first, new Node(p.first+it->first,p.second,it->second)});
		v.erase(it);
		it = v.begin();
		p = *it;
		v.erase(it);
	}
	return p.second;
}
void huffman_helper(unordered_map<char,vector<bool>> &t, Node* node, vector<bool> v){
	if(!(node->l||node->r)){
		t[node->x] = v;
		return;
	}
	v.push_back(false);
	huffman_helper(t,node->l,v);
	v.back() = true;
	huffman_helper(t,node->r,v);
}
unordered_map<char,vector<bool>> huffman_table(Node* h){
	unordered_map<char,vector<bool>> t;
	if(!h)	return t;
	huffman_helper(t,h,vector<bool>());
	return t;
}
void print(char c){
	if(c=='\s')	cout<<"\\s";
	else if(c=='\t')	cout<<"\\t";
	else if(c=='\n')	cout<<"\\n";
	else cout<<c;
}
void print(const vector<bool> &v){
	for(const auto &x: v)
		cout<<x;
}
class NodePrinter{
private:
	unordered_map<char,vector<bool>> t;
	int sum;
public:
	NodePrinter(unordered_map<char,vector<bool>> &huffman_table) : t(huffman_table), sum(0) {
		for(auto p: t)
			sum += p.second.size();
	}
	int getSum(){
		return sum;
	}
	void operator() (Node* node){
		if(node->l||node->r){
			cout<<node->x;
			return;
		}
		print(char(node->x));
		if(t.find(node->x)==t.end())	return;
		cout<<"\t->\t";
		print(t[node->x]);
	}
};
void print_binary_tree(Node* node, NodePrinter &printer, int depth=0){
	if(!node)	return;
	print_binary_tree(node->r, printer, depth+1);
	string str = "";
	for(int i=0; i<depth; i++)
		cout<<"|\t";
	printer(node);
	cout<<endl;
	print_binary_tree(node->l, printer, depth+1);
	return;
}
vector<bool> encode(const string &s, unordered_map<char,vector<bool>> &t){
	vector<bool> v;
	for(char c: s)
		for(bool b: t[c])
			v.push_back(b);
	return v;
}
string decode(const vector<bool> &v, Node* h){
	string s = "";
	Node* node = h;
	for(bool b: v){
		node = b?node->r:node->l;
		if(!(node->l||node->r)){
			s.push_back(char(node->x));
			node = h;
		}
	}
	return s;
}
int main() {
	// CAUTION: this method is an exception for the case 
	// when all text contains just one unique character.
	// E.g. ccccccccccccccccccccccccccccccc, without any whitespace etc.
	string s;
	string p="";
	while(getline(cin,s))
		p += s + "\n";
	Node* h = huffman_tree(p);
	unordered_map<char,vector<bool>> t = huffman_table(h);
	NodePrinter np(t);
	print_binary_tree(h,np);
	cout<<endl<<endl;
	int sum = np.getSum();
	cout<<endl;
	cout<<"Size of text: text_length*char_size = ";
	cout<<p.size()<<" * "<<sizeof(char)*CHAR_BIT<<" = ";
	cout<<p.size()*sizeof(char)*CHAR_BIT<<endl;
	cout<<"Size of table: uniq_chars*(char_size + avg_encoding_size) = ";
	cout<<t.size()<<" * ("<<sizeof(char)*CHAR_BIT<<" + "<<sum<<"/"<<t.size()<<") = ";
	cout<<t.size()*sizeof(char)*CHAR_BIT+sum<<endl;
	vector<bool> v = encode(p,t);
	cout<<"Size of encoded string: "<<v.size()<<endl;
	print(v);
	cout<<endl;
	cout<<endl;
	string text = decode(v,h);
	cout<<"Check if the original text and decoded text matches? ";
	cout<<(p==text?"Yes.":"No.")<<endl;
	cout<<endl<<text<<endl;
	return 0;
}

Ideone it!