22
2016
01

Hash table在MudOS中的实现

Hash table在MudOS中的实现来源网络,如有侵权请告知,即处理!


    本文介绍了 MudOS中使用的散 列函数,并对包装了散列函数的散 列表寻址操作做了一个简单的分析,最后模拟MudOS中object hash table实现了一个简 化的散列表。

    在MudOS中,散列 表的应用非常广泛,可以说凡是用到查找的地方都用到了散列(hash table),散列的好处在于它的 效率,理想状态下,搜索、插入、删除操作的时间均为O(1),在应用中,虽然达不到这样的理想状态,但相 比于其它数据结构来说,hash table的优势还是很明显的。在非理想状态下,不可避免的会遇到碰撞 问题(collision),MudOS源码中处理碰撞 所采用的方法是使用链表散列。

散列函数(hash function)

    在最新发布的MudOS源码(v22.2b14)中,散列函数(hash function)定义于hash.c:

/* hash.c */

/*

** A simple and fast generic string hasher based on Peter K.  Pearson's

** article in CACM 33-6, pp. 677.

*/

#define EDIT_SOURCE

#define NO_SOCKETS

#define NO_OPCODES

#include "std.h"

static int T[] =

{

    1, 87, 49, 12, 176, 178, 102, 166, 121, 193, 6, 84, 249,  230, 44, 163,

    14, 197, 213, 181, 161, 85, 218, 80, 64, 239, 24, 226,  236, 142, 38, 200,

    110, 177, 104, 103, 141, 253, 255, 50, 77, 101, 81, 18,  45, 96, 31, 222,

    25, 107, 190, 70, 86, 237, 240, 34, 72, 242, 20, 214, 244, 227, 149, 235,

    97, 234, 57, 22, 60, 250, 82, 175, 208, 5, 127, 199, 111,  62, 135, 248,

  174, 169, 211, 58, 66, 154, 106, 195, 245, 171, 17, 187,  182, 179, 0, 243,

 132, 56, 148, 75, 128, 133, 158, 100, 130, 126, 91, 13, 153,  246, 216, 219,

    119, 68, 223, 78, 83, 88, 201, 99, 122, 11, 92, 32, 136,  114, 52, 10,

    138, 30, 48, 183, 156, 35, 61, 26, 143, 74, 251, 94, 129,  162, 63, 152,

    170, 7, 115, 167, 241, 206, 3, 150, 55, 59, 151, 220, 90,  53, 23, 131,

    125, 173, 15, 238, 79, 95, 89, 16, 105, 137, 225, 224,  217, 160, 37, 123,

    118, 73, 2, 157, 46, 116, 9, 145, 134, 228, 207, 212, 202, 215, 69, 229,

    27, 188, 67, 124, 168, 252, 42, 4, 29, 108, 21, 247, 19,  205, 39, 203,

 233, 40, 186, 147, 198, 192, 155, 33, 164, 191, 98, 204, 165, 180, 117, 76,

    140, 36, 210, 172, 41, 54, 159, 8, 185, 232, 113, 196,  231, 47, 146, 120,

    51, 65, 28, 144, 254, 221, 93, 189, 194, 139, 112, 43, 71, 109, 184, 209,

};

INLINE int

hashstr P3(char *, s,           /* string to hash */

                  int, maxn,      /* maximum number of chars to consider */

                  int, hashs)

{

    register unsigned int h;

    register unsigned char *p;

    h = (unsigned char) *s;

    if (h) {

       if (hashs > 256) {

           register int oh = T[(unsigned char) *s];

           for (p = (unsigned char *) s + 1; *p

                     && p <= (unsigned char *) s + maxn; p++) {

              h = T[h ^ *p];

              oh = T[oh ^ *p];

           }

           h |= (oh << 8);

       } else

           for (p = (unsigned char *) s + 1; *p

                     && p <= (unsigned char *) s + maxn; p++)

              h = T[h ^ *p];

    }

    return (int) (h % hashs);

}

/*

 * whashstr is faster than hashstr, but requires an even  power-of-2 table size

 * Taken from Amylaars driver.

 */

INLINE int

whashstr P2(char *, s, int, maxn)

{

    register unsigned char oh, h;

    register unsigned char *p;

    register int i;

       if(!s)return 0;

    if (!*s)

              return 0;

    p = (unsigned char *) s;

    oh = T[*p];

    h = (*(p++) + 1) & 0xff;

    for (i = maxn - 1; *p && --i >= 0; ) {

              oh = T[oh ^ *p];

              h = T[h ^ *(p++)];

    }

    return (oh << 8) + h;

}

注释中说明这两个函数基于Pearson发表的一篇 文章,这篇文章发表于Communications of the ACM archive Volume 33,  Issue 6 (June 1990),文章题目是: Fast hashing of  variable-length text strings。whashstr函数取自于Amylaars driver——另一个mud driver? 这个函数比第一个函数hashstr速度要快一点,因此在MudOS中 全部采用whashstr函数来实现散列表。

我并没有找到Pearson的那篇文章,对于数组T[]的产生也就无从得知,在查找资料(参 考资料3)时找到另一个产生的T的算法和一个散列函数:

  -- use an array of 256 1-byte values

  ub1 x[0..255];

  -- fill it with the values 0 to 255 in some  pseudorandom order

  -- (this is derived from the alleged RC4)

  { for i in 0..255 do

    { x[i] := i;

    }

  }

  k=7;

  { for j in 0..3 do

    { for i in 0..255 do

      { s = x[i];

        k = (k + s) mod 256;

        x[i] = x[k];

        x[k] = s;

      }

    }

  }

  -- use the length of the message to initialize the  hash

  -- XXX can be 0, or some constant, or some argument

  hash := (XXX+len) mod 256;

  -- hash the characters in the message one byte at a  time

  {for i in 0..len-1 do

{hash := (hash +  message[i]) mod 256;  hash := x[hash];}}

将生成T的算法写成c函数:

void gen_T(int* vec)

{

       int i, j, s, k = 7;

       for (i = 0; i < 256; ++i)

       {

              vec[i] = i;

       }

       for (j = 0; j < 4; ++j)

       {

              for (i = 0; i < 256; ++i)

              {

                     s = vec[i];

                     k = (k + s) % 256;

                     vec[i] = vec[k];

                     vec[k] = s;

              }

       }

}

于是,我们有 了另一个散列函数:

static int simulate_T[] =

{

        49,118,63,252,13,155,114,130,137,40,210,62,219,246,136,221,

        174,106,37,227,166,25,139,19,204,212,64,176,70,11,170,58,

        146,24,123,77,184,248,108,251,43,171,12,141,126,41,95,142,

        167,46,178,235,30,75,45,208,110,230,226,50,32,112,156,180,

        205,68,202,203,82,7,247,217,223,71,116,76,6,31,194,183,

        15,102,97,215,234,240,53,119,52,47,179,99,199,8,101,35,

        65,132,154,239,148,51,216,74,93,192,42,86,165,113,89,48,

        100,195,29,211,169,38,57,214,127,117,59,39,209,88,1,134,

        92,163,0,66,237,22,164,200,85,9,190,129,111,172,231,14,

        181,206,128,23,187,73,149,193,241,236,197,159,55,125,196,60,

        161,238,245,94,87,157,122,158,115,207,17,20,145,232,107,16,

        21,185,33,225,175,253,81,182,67,243,69,220,153,5,143,3,

        26,213,147,222,105,188,229,191,72,177,250,135,152,121,218,44,

        120,140,138,28,84,186,198,131,54,2,56,78,173,151,83,27,

        255,144,249,189,104,4,168,98,162,150,254,242,109,34,133,224,

        228,79,103,201,160,90,18,61,10,233,91,80,124,96,244,36           

};

int simulate_hashstr(char * s, int len)

{

       int h = len % 256;

       for (int i = 0; i < len; ++i)

       {

              h = (h + s[i]) % 256;

              h = simulate_T[h];

       }

       return h;

}

显然,这个散 列函数生成的值在0~255之间。

处理字符串的散列函数还有很多,详见参 考资料3。

使用散列函数寻址

       在MudOS源码(v22.2b14)中,定义了5个宏或函数分别处理应用中的各个散列表的寻址操作,这些宏包装了whashstr函数以方便使用:

1.          

/* lex.h */

#define DEFHASH 64

#define defhash(s) (whashstr((s), 10) & (DEFHASH - 1))

2.          

/* lex.c */

#define IDENT_HASH_SIZE 1024

#define IdentHash(s) (whashstr((s), 20) & (IDENT_HASH_SIZE - 1))

3.        

/* object.c */

/* CFG_LIVING_HASH_SIZE定义于 options.h

   #define CFG_LIVING_HASH_SIZE     256 */

INLINE_STATIC int hash_living_name P1(char *, str)

{

    return whashstr(str, 20) & (CFG_LIVING_HASH_SIZE - 1);

}

4.         

/* otable.c */

#define ObjHash(s) whashstr(s, 40) & otable_size_minus_one

5.         

/* stralloc.c */

#define StrHash(s) (whashstr((s), 20) & (htable_size_minus_one))

    对于1、2、3,参数和 函数功能已经很明显:萃取(extract)低位字节作为被评估(evaluated) 的字符串所在散列表的位置,对应的散列表的大小为DEFHASH,IDENT_HASH_SIZE,CFG_LIVING_HASH_SIZE,由于使用的“&” 操作(按位与)来萃取低位字节,因此,需要将这些散列表的大小置为2的指数。

    对于第4个宏,参数otable_size_minus_one在 otable.c中 被赋值。

/* otable.c */

/*

 * hash table - list of pointers to heads of object chains.

 * Each object in chain has a pointer, next_hash, to the next object.

 */

static object_t **obj_table = 0;

void init_otable()

{

    int x, y;

    /* ensure that  otable_size is a power of 2 */

    y = OTABLE_SIZE;

    for (otable_size = 1; otable_size < y; otable_size *= 2)

       ;

    otable_size_minus_one = otable_size - 1;

    obj_table = CALLOCATE(otable_size, object_t *,

                       TAG_OBJ_TBL, "init_otable");

    for (x = 0; x < otable_size; x++)

       obj_table[x] = 0;

}

    宏OTABLE_SIZE的功能为存储配置文件中object table size的值,在MudOS启动时,在main函数里调用读取配置文件的函数,对OTABLE_SIZE进 行赋值,在init_otalbe函数中将这个值调整为2的 指数,并将这个值赋给otable_size_minus_one。

    对于第5个宏,htable_size_minus_one的 赋值过程与otable_size_minus_one大同小异。但是在查看某个mudlib的配置文件时发现下面的问题:

/* 取自config.xxx */

# Define  the size of the shared string hash table.  This number should

# a prime, probably between 1000 and 30000; if you set it to about 1/5

# of the  number of distinct strings you have, you will get a hit ratio

# (number  of comparisons to find a string) very close to 1, as found strings

# are  automatically moved to the head of a hash chain.  You will never

# need  more, and you will still get good results with a smaller table.

hash table size : 7001

    注释中(以#开头的语句),说明hash table size必须为质数(prime),然而源代 码中却需要保证这个值为2的指数。我想出现前后矛盾的原因大概和MudOS版 本差异有关:比较早的版本使用的是hashstr 函数 ,这个函数在后续的寻址操作中用的是“%” 求模运算,而使用质数作为hash table的大小在解决碰撞的效率上要好些(猜测如此,未经求证),因此这些mudlib的config文件就一直沿用这些传统;目前比较新的MudOS版本采用的是whashstr 函数,其后续的寻址操作——正如之前介绍的,用的是“&”按位与运算,这 必然要求hash table的大小必须为2的指数。

散列表的实现

    有了前面分析的散列函数和宏,本文将以object table为 例,来实现一个hash table,object table实 现的源码参见最新发布的MudOS源码 (v22.2b14)中的otable.c。下面这些代码与源码有差异,这么做的 原因是为了不牵涉到具体应用,而仅仅是单纯的实现一个hash table。

/* hashtable.h */

#if !defined(_HASH_TABLE_H_)

#define _HASH_TABLE_H_

#define STRING_SIZE 64

typedef struct object_s {

       char name[STRING_SIZE];

       struct object_s *next_hash;

       char value[STRING_SIZE];

} object_t;

void init_otable();

void clear_otable();

void enter_object_hash(object_t *);

void remove_object_hash(object_t *);

object_t *lookup_object_hash(char *);

void dump();

#endif     // _HASH_TABLE_H_

/* hashtable.c */

#include "stdafx.h"

#include "stdlib.h"

#include "stdio.h"

#include "string.h"

#include "hashtable.h"

#define BUCKET_SIZE 256    

static int otable_size;

static int otable_size_minus_one;

int whashstr(char *, int);

#define ObjHash(s) whashstr((char *)s, 40) & otable_size_minus_one

/*

* hash table - list of  pointers to heads of object chains.

* Each object in chain  has a pointer, next_hash, to the next object.

*/

static object_t **obj_table = 0;

void init_otable()

{

    int y;

    /* ensure that  otable_size is a power of 2 */

    y = BUCKET_SIZE;

    for (otable_size = 1; otable_size < y; otable_size *= 2)

              ;

    otable_size_minus_one = otable_size - 1;

       /* by using calloc,  each element in obj_table is initialized to 0 */

    obj_table = (object_t **)calloc(otable_size, sizeof(object_t *));

}

void clear_otable()

{

       int i;

       free(obj_table);

       for (i = 0; i < BUCKET_SIZE; ++i) {

              obj_table[i] = 0;

       }

}

/* A global. */

static int h;

/*

* Looks for obj in  table, moves it to head.

*/

object_t *find_obj_n(char * s)

{

    object_t *curr, *prev;

    h = ObjHash(s);

    curr = obj_table[h];

    prev = 0;

    while (curr) {

              if (!strcmp(curr->name, s)) {      /* found it */

                     if (prev) {             /* not at head of list */

                            prev->next_hash = curr->next_hash;

                            curr->next_hash = obj_table[h];

                            obj_table[h] = curr;

                     }

                     return (curr);  /* pointer to object */

              }

              prev = curr;

              curr = curr->next_hash;

    }

    return (0);                   /* not  found */

}

/*

* Add an object to the  table - can't have duplicate names.

*/

void enter_object_hash(object_t * ob)

{

    object_t *s;

    s = find_obj_n(ob->name); /* This sets h */

    ob->next_hash = obj_table[h];

    obj_table[h] = ob;

    return;

}

/*

* Remove an object from the table - generally called when it

* is removed from the  next_all list - i.e. in destruct.

*/

void remove_object_hash(object_t * ob)

{

    object_t *s;

    s = find_obj_n(ob->name); /* this sets h, and  cycles the ob to the front */

    obj_table[h] = ob->next_hash;

    ob->next_hash = 0;

    return;

}

/*

* Lookup an object in  the hash table; if it isn't there, return null.

* This is only  different to find_object_n in that it collects different

* stats; more finds are actually done than the user ever asks for.

*/

object_t *lookup_object_hash(char * s)

{

    return find_obj_n(s);

}

/*

* Display the sum of  elements in obj_table.

*/

void dump()

{

       int i, count;

       object_t * curr;

       for (i = 0; i < BUCKET_SIZE; ++i) {

              count = 0;

              curr = obj_table[i];

              while (curr) {

                     ++count;

                     curr = curr->next_hash;

              }

              printf("%d\n", count);

       }

}

/* Hash function. */

static int T[] =

{

    1, 87, 49, 12, 176, 178, 102, 166, 121, 193, 6, 84, 249, 230, 44, 163,

              14, 197,  213, 181, 161, 85, 218, 80, 64, 239, 24, 226, 236, 142, 38, 200,

              110, 177, 104, 103, 141, 253, 255, 50, 77, 101, 81, 18, 45, 96, 31, 222,

              25, 107,  190, 70, 86, 237, 240, 34, 72, 242, 20, 214, 244, 227, 149, 235,

              97, 234,  57, 22, 60, 250, 82, 175, 208, 5, 127, 199, 111, 62, 135, 248,

              174, 169, 211, 58, 66, 154, 106, 195, 245, 171, 17, 187, 182, 179, 0, 243,

              132, 56,  148, 75, 128, 133, 158, 100, 130, 126, 91, 13, 153, 246, 216, 219,

              119, 68,  223, 78, 83, 88, 201, 99, 122, 11, 92, 32, 136, 114, 52, 10,

              138, 30,  48, 183, 156, 35, 61, 26, 143, 74, 251, 94, 129, 162, 63, 152,

              170, 7,  115, 167, 241, 206, 3, 150, 55, 59, 151, 220, 90, 53, 23, 131,

              125, 173, 15, 238, 79, 95, 89, 16, 105, 137, 225, 224, 217, 160, 37, 123,

              118, 73,  2, 157, 46, 116, 9, 145, 134, 228, 207, 212, 202, 215, 69, 229,

              27, 188,  67, 124, 168, 252, 42, 4, 29, 108, 21, 247, 19, 205, 39, 203,

              233, 40,  186, 147, 198, 192, 155, 33, 164, 191, 98, 204, 165, 180, 117, 76,

              140, 36,  210, 172, 41, 54, 159, 8, 185, 232, 113, 196, 231, 47, 146, 120,

              51, 65,  28, 144, 254, 221, 93, 189, 194, 139, 112, 43, 71, 109, 184, 209,

};

int whashstr(char * s, int maxn)

{

    register unsigned char oh, h;

    register unsigned char *p;

    register int i;

       if(!s)return 0;

    if (!*s)

              return 0;

    p = (unsigned char *) s;

    oh = T[*p];

    h = (*(p++) + 1) & 0xff;

    for (i = maxn - 1; *p && --i >= 0; ) {

              oh = T[oh ^ *p];

              h = T[h ^ *(p++)];

    }

    return (oh << 8) + h;

}

这份代码实现了一个hashtable的基本操作:插入,删除,查找。为了测试这个hashtable, 我写了一份非常简单的测试代码,测试数据取自于http://www.graphcomp.com/info/mud/mudos/History_MudOS.html,为了简化代码,我用python对这篇文章进行 了一下处理,首先保存这篇文章的内容为: history_mudos.txt,我把它存放在E盘根目录,python处理源码如下:

#  preread.py

import re

def foo  ():

    rex =  re.compile('[A-Za-z0-9]+')

    f =  open('e:\out.txt','w')

    for  line in open('e:\histroy_mudos.txt','r'):

         for word in rex.findall(line):

            f.write(word)

            f.write('\n')

在python控 制台下运行这些代码,获得处理过的文件out.txt,out.txt中 将包含history_mudos.txt中的所有单词,每个单词按行排列。有了经过预处理的out.txt,编写测试代码就简单得多了,为了更加简化操作,下面的代码将使用c++来 编写:

/* main.cpp */

#include <iostream>

#include <fstream>

#include

#include <string>

#include <vector>

#include <set>

#include "hashtable.h"

using namespace std;

static vector<object_t *> ob_table;

struct constructor {

       operator () (const string& str)

       {

              object_t * pob = new object_t;

              strcpy(pob->name, str.c_str());

              strcpy(pob->value, strupr(const_cast<char*>(str.c_str())));

              pob->next_hash = 0;

              ob_table.push_back(pob);

       }

};

struct destructor {

       operator () (object_t * pob)

       {

              delete pob;

              pob = 0;

       }

};

struct ob_inserter {

       operator () (object_t * pob)

       {

              enter_object_hash(pob);

       }

};

void create()

{

       ifstream infile;

       string str;

       infile.open("E:\\out.txt");

       set<string> word_set;

       while (getline(infile, str)) {

              word_set.insert(str);

       }

       for_each(word_set.begin(), word_set.end(), constructor());

       infile.close();

       init_otable();

       for_each(ob_table.begin(), ob_table.end(), ob_inserter());

}

void clear()

{

       for_each(ob_table.begin(), ob_table.end(), destructor());

       clear_otable();

}

void lookup(char* s)

{

       object_t * pob = lookup_object_hash(s);

       if (pob)

              cout << pob->value << endl;

}

int main(void)

{

       create();

       lookup("the");

       lookup("history");

       lookup("of");

       lookup("MudOS");

       dump();

       clear();

}

编译运行,结果如下:

THE

OF

MUDOS

dump函数统计个数见下图,总共651个点,对于散列大 小为256而言,这样的分布还是满足要求的。

结果证明这份hash table的实现代码是正确的。

参考资料:

[1]    Sartaj  Sahni. Data Structures, Algorithms, and Applications in C++

[2]    侯捷. 《STL源码分析》

[3]    http://burtleburtle.net/bob/hash/index.html

[4]    http://www.mudos.org/


« 上一篇下一篇 »

发表评论:

◎欢迎参与讨论,请在这里发表您的看法、交流您的观点。