姓名排序

发表于 2017-11-06 更新于 2023-08-20 分类于算法

姓名排序

题目描述：给定 30,000 个中国人的名字，选用一种线性排序方法有效排序，要求使用内存不超过 1G。
想法 1：计数排序，汉字组合显然是无穷多的，无法确定元素的范围，而且可能有重名的情况，所以不能使用；
想法 2：使用桶排序，不好确定每个桶的元素范围，一般情况下都是假定元素限制在(0,1]；
想法 3：使用基数排序可以一次按名字的第 i 位的排序，如果每个桶的标志为一个汉字，那桶就太多了，如果每个桶的标志为汉字的首字母，桶够少了，但是每个桶都需要大量时间，既然是汉字，我们完全可以用汉字拼音；

获得测试组

一组有 3w 行，每一行随机 2-10 个 utf-8 中文字符

从这里[1]了解到 unicode 字符的基本汉字编码范围为 4E00-9FA5，我的想法是随机生成一个 19968-40869 的 unsigned int 型数，然后编码成 utf-8 字符；

utf-8 字符可以由 unicode 字符转换而来，转换算法[2]为：

×××××××× ×××××××× -> 1110×××× 10×××××× 10××××××
1110拼上前4位
10拼上中间6位
10拼上后6位

输入到文件中

源码

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
//中文字为3字节，第一步
void randomname(unsigned char *name){
    int length = random() % 8 + 2;
    int pos;
    for(pos = 0; pos < length; pos++){
        unsigned int a = (random() % 20902) + 19968;
	       name[3 * pos] = (a >> 12) + 224;
	       name[3 * pos + 1] = ((a >> 6) % 64) + 128;
	       name[3 * pos + 2] = (a % 64) + 128;
    }
    name[3 * pos] = '\0';
    printf("%s", name);
}
void makefile(FILE *f){
    if(f == NULL){
        fprintf(stderr, "error1");
        exit(1);
    }
    unsigned char name[35];
    int count = 30000;
    srand(time(0));
    while(count--){
        randomname(name);
        if(fprintf(f, "%s%c", name, '\r') == 0){
            fprintf(stderr, "error");
	           break;
        }
    }
    fclose(f);
}
int main(void){
    FILE *f = fopen("a.txt", "w");
    makefile(f);
    return 0;
}

转码

我们希望按照拼音顺序排序，所以得先将 UTF-8 字符转换为 GB2312 编码

这里遇到了难题，我打算使用 linux 下的 iconv 进行转码，但是转出来的都是乱码，可能是对转码的工作原理还不明白。

刚开始我打算编写 C 语言代码转换 utf-8 -> gb2312，但是失败了，一是因为有些字符在 gb2312 中不存在，报错：

1 2	C语言中出现 iconv(pattern, pin, &inlen, pout, &outlen) 返回-1 命令行下出现 iconv: 未知 0 处的非法输入序列

二是我直接输出到了终端，结果出现了乱码：

你好 -> ����

是因为终端默认使用 utf-8，这样就无法显示了。

后来使用 GBK，因为 GBK 包括 gb2312，将编码后的字符直接输出到了文件中，这样就能看到编码后的结果了

代码：

#include <stdio.h>
#include <iconv.h>
#include <string.h>
#include <stdlib.h>
int code_convert(char *from_charset, char *to_charset, char *inbuf, size_t inlen, char *outbuf, size_t outlen){
    iconv_t pattern;
    char **pin = &inbuf;
    char **pout = &outbuf;
    pattern = iconv_open(to_charset, from_charset);
    if(pattern == 0){
        return -1;
    }
    memset(outbuf, 0, outlen);
    if(iconv(pattern, pin, &inlen, pout, &outlen) == -1){
        return -1;
    }
    iconv_close(pattern);
    return 0;
}
#define MAXNUM 30000
#define MAXNAME 31
char names_utf8[MAXNUM][MAXNAME];
char names_gb2312[MAXNUM][MAXNAME];
void readFile(char *fileName){
    FILE *f;
    if((f = fopen(fileName, "r")) == NULL){
        fprintf(stderr, "cant open file %s\n", fileName);
        exit(1);
    }
    
    int pos;
    for(pos = 0; pos < MAXNUM; pos++){
        //fgets(names_utf8[pos], MAXNAME, f);fgets会保留行末尾的\r\n
        fscanf(f, "%s", names_utf8[pos]);
    }
}
void convert(){
    FILE *f = fopen("b.txt", "w");
    int pos;
    for(pos = 0; pos < MAXNUM; pos++){
        if(code_convert("utf-8", "GBK", names_utf8[pos], strlen(names_utf8[pos]), names_gb2312[pos], MAXNAME) == -1){
	           fprintf(stderr, "iconv error");
	           exit(2);
        }
       fprintf(f, "%s\r\n", names_gb2312[pos]);
    }
    fclose(f);
}

这部分代码等价于以下命令：

1	sudo iconv -f utf-8 -t GBK a.txt -o b.txt

起初认为像 GBK 这样的汉字编码标准会很地道地按拼音顺序来编码，但是还是太天真了，因为有多音字存在等原因，要按拼音编码是不大可能的；

最终上网找了一个汉字转拼音的程序[4]，找拼音的大致思路是预先准备好拼音及该拼音的汉字，要找一个汉字的拼音就遍历一遍找，方法比较简单只覆盖了一部分汉字，而且是线性遍历效率不高，还有个缺点就是多音字只能找到第一个；

对姓名进行排序

最后就是使用基数排序了，因为使用 utf-8 编码，所以数组中 27-29 为第 10 个汉字，24-26 为第 9 个汉字，以此类推，姓名有长有短，比如“张三”的后 8 位都不存在，默认放入 0 号桶，其他情况如果能找到该位汉字的拼音，将姓名放到对应的桶中，如果找不到，默认该汉字未编码，放入 0 号桶，这里桶其实代表 a/an/ang…，最后为了保证排序的稳定性，桶使用了队列实现；

代码如下所示：
其中pinyin.h是，代码如这里所示。

#include "pinyin.h" //汉字-拼音
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#define P2HSIZE (sizeof(p2h) / sizeof(p2h[0]))
int getP2Hpos(char word[]){
    if(word == NULL || word[0] == '\0'){ return -1; }
    int pos;
    char *curHzString;
    for(pos = 0; pos < P2HSIZE; pos++){
        curHzString = p2h[pos].hz;
        if(strstr(curHzString, word)){
            return pos;
        }
    }
    return -1;
}
#define MAXNUM 30000 //文件中姓名的数量
#define MAXNAME 31 //姓名最多为10个汉字
char *names_utf8[MAXNUM];
/*******************************************/
void readFile(char *fileName){
    FILE *f;
    if((f = fopen(fileName, "r")) == NULL){
        fprintf(stderr, "cant open file %s\n", fileName);
        exit(1);
    }
    
    int pos;
    for(pos = 0; pos < MAXNUM; pos++){
        names_utf8[pos] = (char*)malloc(MAXNAME * sizeof(char));
        fscanf(f, "%s", names_utf8[pos]);
    }
}
void clearNames(){
    int pos;
    for(pos = 0; pos < MAXNUM; pos++){
        free(names_utf8[pos]);
    }
}
/****************************************/
typedef struct cell{
    int pos;
    struct cell *next;
}cell;
typedef struct bucket{
    cell *front;
    cell *rear;
}bucket;
int isEmpty(bucket *b){
    return b->rear == NULL;
}
cell *makeCell(int pos){
    cell *c = (cell*)malloc(sizeof(cell));
    if(c == NULL){ fprintf(stderr, "error"); }
    c->pos = pos, c->next = NULL;
    return c;
}
bucket *makeBucket(){
    bucket *b = (bucket*)malloc(sizeof(bucket));
    b->front = b->rear = NULL;
    return b;
}
int push(bucket *b, int pos){
    if(b == NULL){printf("error");
        return -1;
    }
    cell *c = makeCell(pos);
    if(isEmpty(b)){
        b->rear = b->front = c;
    }
    else{
        b->front->next = c;
        b->front = c;
    }
    return 0;
}
int pop(bucket *b){
    if(isEmpty(b)){
        return -1;
    }
    else{
        cell *rear = b->rear;
        int pos = rear->pos;
        b->rear = rear->next;
        free(rear);
        return pos;
    }
}
//桶的数量就是拼音的数量，0号桶用于装找不到拼音的汉字
#define BUCKETSNUM (P2HSIZE + 1)
bucket *buckets[BUCKETSNUM];
char tmp[4];
void pushBuckets(int radix){//1 <= radix <= 10，表示第几个字
    int pos, bucketId;
    for(pos = 0; pos < MAXNUM; pos++){//如果改成while(pos++ < MAXNUM)会出现段错误，为什么？？
        strncpy(tmp, names_utf8[pos] + 3 * (radix - 1), 3);
        bucketId = getP2Hpos(tmp) + 1;
        push(buckets[bucketId], pos);
    }
}
//桶中保存的是对应姓名数组的下标
void popBuckets(){
    char *names_tmp[MAXNUM];
    int npos;
    for(npos = 0; npos < MAXNUM; npos++){
        names_tmp[npos] = names_utf8[npos];
    }
    npos = 0;
    int bpos;
    for(bpos = 0; bpos < BUCKETSNUM; bpos++){
        while(!isEmpty(buckets[bpos])){
            int pos = pop(buckets[bpos]);
            names_utf8[npos++] = names_tmp[pos];
        }
    }
}
void initBuckets(){
    int pos;
    for(pos = 0; pos < BUCKETSNUM; pos++){
        buckets[pos] = makeBucket();
    }
}
void clearBuckets(){
    int pos;
    for(pos = 0; pos < BUCKETSNUM; pos++){
        while(pop(buckets[pos]) != -1);
    }
}
void radixSort(){
    int radix;
    for(radix = 10; radix >= 1; radix--){
        pushBuckets(radix);
        popBuckets();
    }
}
void writeResult(){
    FILE *f = fopen("result.dat", "w");
    if(f == NULL){ fprintf(stderr, "open file failed"); exit(1);}
    
    int pos;
    for(pos = 0; pos < MAXNUM; pos++){
        fprintf(f, "%s\r\n", names_utf8[pos]);
    }
    fclose(f);
}
void func(){
    readFile("a.txt");
    initBuckets();
    radixSort();
    writeResult();
    clearNames();
}
int main(void){
    func();
    return 0;
}

拓展

当需要排序的数据量很大，例如 100G 个人按照年龄排序，如何基于外存对这些数据排序？使用 mapreduce 等工具？

姓名排序

获得测试组

转码

对姓名进行排序

拓展

参考