《数据结构》05-树9 Huffman Codes

古城微笑少年丶 2022-04-18 05:29 182阅读 0赞

# 题目 #

In 1953, David A. Huffman published his paper “A Method for the Construction of Minimum-Redundancy Codes”, and hence printed his name in the history of computer science. As a professor who gives the final exam problem on Huffman codes, I am encountering a big problem: the Huffman codes are NOT unique. For example, given a string “aaaxuaxz”, we can observe that the frequencies of the characters ‘a’, ‘x’, ‘u’ and ‘z’ are 4, 2, 1 and 1, respectively. We may either encode the symbols as \{‘a’=0, ‘x’=10, ‘u’=110, ‘z’=111\}, or in another way as \{‘a’=1, ‘x’=01, ‘u’=001, ‘z’=000\}, both compress the string into 14 bits. Another set of code can be given as \{‘a’=0, ‘x’=11, ‘u’=100, ‘z’=101\}, but \{‘a’=0, ‘x’=01, ‘u’=011, ‘z’=001\} is NOT correct since “aaaxuaxz” and “aazuaxax” can both be decoded from the code 00001011001001. The students are submitting all kinds of codes, and I need a computer program to help me determine which ones are correct and which ones are not.

**Input Specification:**  
Each input file contains one test case. For each case, the first line gives an integer N (2≤N≤63), then followed by a line that contains all the N distinct characters and their frequencies in the following format:

> c\[1\] f\[1\] c\[2\] f\[2\] … c\[N\] f\[N\]

where c\[i\] is a character chosen from \{‘0’ - ‘9’, ‘a’ - ‘z’, ‘A’ - ‘Z’, ‘\_’\}, and f\[i\] is the frequency of c\[i\] and is an integer no more than 1000. The next line gives a positive integer M (≤1000), then followed by M student submissions. Each student submission consists of N lines, each in the format:

> c\[i\] code\[i\]

where c\[i\] is the i-th character and code\[i\] is a non-empty string of no more than 63 '0's and '1's.

**Output Specification:**  
For each test case, print in each line either “Yes” if the student’s submission is correct, or “No” if not.

Note: The optimal solution is not necessarily generated by Huffman algorithm. Any prefix code with code length being optimal is considered correct.

**Sample Input:**

> 7  
> A 1 B 1 C 1 D 3 E 3 F 6 G 6  
> 4  
> A 00000  
> B 00001  
> C 0001  
> D 001  
> E 01  
> F 10  
> G 11  
> A 01010  
> B 01011  
> C 0100  
> D 011  
> E 10  
> F 11  
> G 00  
> A 000  
> B 001  
> C 010  
> D 011  
> E 100  
> F 101  
> G 110  
> A 00000  
> B 00001  
> C 0001  
> D 001  
> E 00  
> F 10  
> G 11

**Sample Output:**

> Yes  
> Yes  
> No  
> No

# 分析 #

大概题意就是根据输入的字符和频率判断给出的字符编码是否最优编码

提交错误改了一晚上，直到我发现样例输出是"Yes"和"No"，而不是“yes”和"no"…

## 解法一 ##

纯粹地模拟，把哈夫曼树建起来，再根据每个学生的提交建树，最后判断是否最优。  
循规蹈矩地自己建最小堆存哈夫曼树，最小堆的插入，最小堆的删除，最小堆的初始化，哈夫曼树的创建。  
然后正戏开始，当输入字符为 ‘0’，建立左树，输入字符为 ‘1’，建立右树，然后在输入字符串结束的地方存放它的权值，当一组数据输入完成，就能得到一棵树，再验证该树的结点个数，WPL 是否和最优的哈夫曼树相同

#include<cstdio>
    #include<cstdlib>
    #include<string>
    #include<iostream>
    #include<map>
    #define HeapCapacity 64   // number of slots in the heap array (slot 0 holds the sentinel)
    #define MinData 0         // weight given to every freshly created node, sentinel included
    typedef struct TreeNode *HuffmanTree;
    typedef struct Heap *MinHeap;
    // One node of a binary code tree.
    struct TreeNode{
    	int weight;          // frequency carried by this node
    	HuffmanTree left;    // child reached by a '0' bit, or NULL
    	HuffmanTree right;   // child reached by a '1' bit, or NULL
    };
    // Binary min-heap whose slots hold pointers to tree nodes.
    struct Heap{
    	HuffmanTree *data;   // slot array; live elements occupy indices 1..size
    	int size;            // number of elements currently stored
    };
    using namespace std;
    
    MinHeap createHeap();   // allocate an empty heap together with its sentinel
    HuffmanTree createHuffman();  // allocate a single childless tree node
    void sortHeap(MinHeap H,int i); // sift the node at index i down inside its sub-heap
    void adjust(MinHeap H);  // turn the whole array into a min-heap
    MinHeap InitHeap(int n); // read n (character, frequency) pairs into a heap
    HuffmanTree Delete(MinHeap H); // remove and return the minimum-weight node
    void Insert(MinHeap H,HuffmanTree Huff); // add a node to the heap
    HuffmanTree Huffman(MinHeap H);  // build the Huffman tree from the heap
    int WPL(HuffmanTree Huff,int depth); // weighted path length (total encoded length) of a tree
    
    
    map<char,int> mappp;  // character -> frequency, filled by InitHeap
    
    // Allocates an empty min-heap with room for HeapCapacity slots.
    // Slot 0 is reserved for a sentinel node of weight MinData, so the first
    // real element is stored at index 1. Returns the new heap; the caller owns it.
    MinHeap createHeap(){
    	MinHeap H = (MinHeap)malloc(sizeof(struct Heap));
    	// The array stores pointers to nodes, so it is sized by the pointer
    	// type rather than by the node struct.
    	H->data = (HuffmanTree *)malloc(sizeof(HuffmanTree) * HeapCapacity);
    	H->size = 0;
    	// Sentinel: Insert() compares against data[0] when it reaches the root.
    	H->data[0] = createHuffman();
    	return H;
    }
    
    // Allocates one tree node that has no children and weight MinData.
    // Returns the new node; the caller owns it.
    HuffmanTree createHuffman(){
    	HuffmanTree node = (HuffmanTree)malloc(sizeof(*node));
    	node->left = NULL;
    	node->right = NULL;
    	node->weight = MinData;   // start at the smallest weight
    	return node;
    }
    
    // Sifts the node stored at index i down until neither child is lighter,
    // restoring the min-heap property for the sub-heap rooted at i.
    void sortHeap(MinHeap H,int i){
    	HuffmanTree moving = H->data[i];   // node looking for its final slot
    	int hole = i;
    	while(2*hole <= H->size){
    		// Pick the lighter of the two children (the right one may not exist).
    		int smaller = 2*hole;
    		const int rightChild = smaller + 1;
    		if(rightChild <= H->size && H->data[rightChild]->weight < H->data[smaller]->weight)
    			smaller = rightChild;
    		// Neither child is lighter than the moving node: it belongs here.
    		if(H->data[smaller]->weight >= moving->weight)
    			break;
    		// Otherwise pull the lighter child up and continue one level down.
    		H->data[hole] = H->data[smaller];
    		hole = smaller;
    	}
    	H->data[hole] = moving;
    }
    
    
    // Heapifies the whole array: sifts down every node that has a child,
    // starting from the last such node and finishing at the root.
    void adjust(MinHeap H){
    	int i = H->size/2;
    	while(i >= 1){
    		sortHeap(H,i);
    		--i;
    	}
    }
    
    // Reads up to n "<char> <frequency>" pairs from stdin, records each pair
    // in mappp and stores one leaf node per pair in a fresh heap, then
    // heapifies it. Returns the heap.
    // Reading stops early on malformed input or once the heap array is full.
    MinHeap InitHeap(int n){
    	MinHeap H = createHeap();
    	char c;  // character just read
    	int f;   // its frequency
    	// Slots 1..HeapCapacity-1 are usable; never write past the array.
    	for(int i=0;i<n && H->size<HeapCapacity-1;i++){
    		// The leading blank in the format skips any run of whitespace
    		// (space, '\r', '\n') left over from the previous read, so the
    		// input may use CRLF line endings or extra spaces.
    		if(scanf(" %c %d",&c,&f)!=2)
    			break;
    		// insert() keeps the first frequency if a character repeats.
    		mappp.insert(pair<char,int>(c,f));
    		HuffmanTree Huff = createHuffman();
    		Huff->weight = f;
    		H->data[++H->size] = Huff;
    	}
    	// Establish the min-heap order over everything that was read.
    	adjust(H);
    	return H;
    }
    
    // Removes the minimum-weight node from the heap and returns it.
    // Returns NULL when the heap is empty, instead of handing back the
    // sentinel and driving size negative.
    HuffmanTree Delete(MinHeap H){
    	if(H->size < 1)
    		return NULL;
    	HuffmanTree top = H->data[1];
    	// Move the last element to the root, shrink the heap, and let
    	// sortHeap sift it down to its proper place.
    	H->data[1] = H->data[H->size];
    	H->size--;
    	sortHeap(H,1);
    	return top;
    }
    
    // Adds Huff to the heap: opens a slot at the end and moves heavier
    // ancestors down until the right place for Huff is found.
    // At the root the comparison is made against the sentinel in slot 0.
    void Insert(MinHeap H,HuffmanTree Huff){
    	int pos = ++H->size;
    	while(H->data[pos/2]->weight > Huff->weight){
    		H->data[pos] = H->data[pos/2];
    		pos /= 2;
    	}
    	H->data[pos] = Huff;
    }
    
    // Builds a Huffman tree from the nodes in the heap and returns its root.
    // Each round removes the two lightest nodes, joins them under a new
    // parent whose weight is their sum, and puts the parent back.
    HuffmanTree Huffman(MinHeap H){
    	int merges = H->size - 1;   // one merge fewer than the number of nodes
    	while(merges-- > 0){
    		HuffmanTree merged = createHuffman();
    		merged->left = Delete(H);    // lightest node becomes the left child
    		merged->right = Delete(H);   // next lightest becomes the right child
    		merged->weight = merged->left->weight + merged->right->weight;
    		Insert(H,merged);
    	}
    	// The single node left in the heap is the root.
    	return Delete(H);
    }
    
    // Returns the weighted path length of the tree rooted at Huff, i.e. the
    // sum over all leaves of (leaf weight * leaf depth), where Huff itself is
    // at depth `depth`.
    // A NULL subtree contributes 0, so a node with only one child (which a
    // malformed submission can produce) no longer dereferences NULL.
    int WPL(HuffmanTree Huff,int depth){
    	if(Huff==NULL)
    		return 0;
    	// Leaf: its contribution is depth * weight.
    	if(Huff->left==NULL && Huff->right==NULL)
    		return depth*Huff->weight;
    	// Internal node: add up both subtrees one level deeper.
    	return WPL(Huff->left,depth+1) + WPL(Huff->right,depth+1);
    }
    
    // 提交 
    void submit(int n,int codeLen){
        
    	HuffmanTree Huff = createHuffman();
    	HuffmanTree pre;
    	int counter = 1;
    	bool flag = true;
    	char ch;
    	string code; 
    	for(int i=0;i<n;i++){
        
    		getchar();
    		pre = Huff; 
    		// 读入每行 
    		scanf("%c",&ch);
    		cin>>code;
    		// 遍历编码 
    		for(int j=0;j<code.size();j++){
        
    			if(code[j]=='0'){
          // 如果当前编码为 0，左分支 
    				if(pre->left==NULL){
           // 如果左子树不存在，创建 
    					pre->left =createHuffman();
    					counter++;
    				}
    				if(pre->weight != 0)
    					flag =false;
    				pre = pre->left;
    			}else if(code[j]=='1'){
         // 如果当前编码为 0，左分支 
    				if(pre->right==NULL){
           // 如果左子树不存在，创建 
    					pre->right = createHuffman();
    					counter++;
    				}
    				if(pre->weight != 0)
    					flag =false;
    				pre = pre->right;
    			}
    		}
    		if(pre->left || pre->right)
    			flag = false;
    		pre->weight = mappp[ch];   // 从 mapp 取出存的频率
    	}
    	if(counter!=2*n-1 || !flag || WPL(Huff,0) != codeLen){
         // 如果结点不是 2n-1 个  或者编码长度不相等 
    		printf("No\n");
    		return;
    	}else{
        
    		printf("Yes\n");
    		return;
    	}
    } 
    
    
    int main(){
        
    	int n,m;
    	scanf("%d",&n);
    	// 初始化最小堆 
    	MinHeap H = InitHeap(n);
    	// 初始化哈夫曼树 
    	HuffmanTree Huff = Huffman(H);
    	// 计算该哈夫曼树的编码长度 
    	int codeLen = WPL(Huff,0); 
    	scanf("%d",&m);
    	for(int i=0;i<m;i++){
        
    		submit(n,codeLen);
    	} 
    	return 0;
    }

## 解法二 ##

巧妙的算法，STL 中已经实现了最小堆，即 `priority_queue<int,vector<int>,greater<int> >q`，WPL 也不用按树高去算，只需要模拟每次出队两次，入队一次，每次把入队的值加起来，最终得到的值就是 WPL  
学生提交的编码总长度等于每个字符的编码长度 \* 该字符频率之和；若它等于最优 WPL，且任何一个编码都不是另一个编码的前缀，则该提交为最优编码

#include<queue>
    #include<map> 
    #include<iostream>
    #include<algorithm>
    #define MaxSize 64   // capacity of the fixed-size symbol and code arrays
    using namespace std;
    // A symbol of the alphabet together with how often it occurs.
    struct character{
    	char ch;   // the symbol
    	int fre;   // its frequency
    };
    // One line of a student submission: a symbol and the code proposed for it.
    struct huffmanTree{
    	char ch;      // the symbol
    	string str;   // its code, as text
    };
    // Min-first priority queue of weights, used while merging.
    priority_queue<int,vector<int>,greater<int> > q;
    // Symbol -> frequency lookup.
    map<char,int> mapp;
    
    
    // Computes the optimal total encoded length for the n symbols in c
    // without building a tree: repeatedly removes the two smallest weights
    // from the shared queue q, pushes their sum back, and accumulates every
    // such sum. Returns the accumulated total.
    // q is left empty on return.
    int bulidTree(int n,character c[]){
    	for(int i=0;i<n;i++)
    		q.push(c[i].fre);
    	int weight = 0;
    	while(q.size()>1){
    		const int x = q.top();   // smallest weight
    		q.pop();
    		const int y = q.top();   // second smallest weight
    		q.pop();
    		q.push(x+y);
    		// Each merge adds one bit to every symbol below it, so the sum of
    		// the merged weights is the total encoded length.
    		weight += x+y;
    	}
    	// Drop the final root weight. Guarded because pop() on an empty queue
    	// is undefined, which would happen when n <= 0.
    	if(!q.empty())
    		q.pop();
    	return weight;
    }
    bool cmp(huffmanTree a,huffmanTree b){
        
    	return a.str.size() < b.str.size();
    }
    
    // Returns true if any code in code[0..n) is a prefix of (or equal to)
    // another one. Reorders the array: it is sorted by code length first so
    // that each code only has to be checked against the ones after it.
    bool isPrefix(huffmanTree code[],int n){
    	sort(code,code+n,cmp);
    	for(int shorter=0;shorter<n;++shorter){
    		const string &head = code[shorter].str;
    		for(int longer=shorter+1;longer<n;++longer){
    			// Compare the leading head.size() characters of the later
    			// code with the earlier one.
    			if(code[longer].str.compare(0,head.size(),head)==0)
    				return true;
    		}
    	}
    	return false;
    }
    
    void judge(int n,character c[],int weight){
        
    	// 返回 WPL 
    	huffmanTree code[MaxSize];
    	int codelen = 0;
    	for(int i=0;i<n;i++){
        
    		cin>>code[i].ch>>code[i].str;
    		// 编码长度等于编码长度*频率总和 
    		codelen += mapp[code[i].ch]*code[i].str.size();  
    	}
    	if(codelen != weight || isPrefix(code,n))
    		cout<<"No"<<endl;
    	else
    		cout<<"Yes"<<endl;
    } 
    
    int main(){
        
    	int n;
    	int m;
    	cin>>n;
    	character c[MaxSize];
    	for(int i=0;i<n;i++){
        
    		cin>>c[i].ch>>c[i].fre;
    		mapp[c[i].ch] = c[i].fre;
    	}
    	int weight = bulidTree(n,c);
    	cin>>m;
    	for(int i=0;i<m;i++)
    		judge(n,c,weight);
    	return 0;
    }