文字列のハッシュ値の計算

#algorithm

ハッシュの概念はわかる、けど実際のところどんなもんかよくわかってない…。特に文字列のハッシュ値をどうやって計算してるのかと、ハッシュ値が衝突したときにどうするのかと、バッファのサイズをどうするのか。なのでいくつかの処理系を調べてみた。

Gauche

ソースをhashでgrepしたらそれらしい部分が見つかった。

/* General hash function */
u_long Scm_Hash(ScmObj obj)
{
    u_long hashval;
    if (!SCM_PTRP(obj)) {
        SMALL_INT_HASH(hashval, (u_long)SCM_WORD(obj));
        return hashval;
    } else if (SCM_NUMBERP(obj)) {
        return Scm_EqvHash(obj);
    } else if (SCM_STRINGP(obj)) {
        goto string_hash;
    ...

string_hash:
  {
      const char *p;
      const ScmStringBody *b = SCM_STRING_BODY(obj);
      p = SCM_STRING_BODY_START(b);
      STRING_HASH(hashval, p, SCM_STRING_BODY_SIZE(b));
      return hashval;
  }

/*============================================================
 * Hash functions
 */

/* Hash function calculates 32bit hash value from the given object.
   HASH2INDEX macro maps the hash value to the bucket number.
   (On 64 bit architecture, it's OK to calculate 64bit, but the
   upper bits are discarded by HASH2INDEX to maintain compatibility. */

/* For String
 *
 * Usually, "shift+add" scheme for string hasing works well.  But
 * I found that it works well only if you take the lower bits.
 * Unfortunately, we need to take higher bits for multiplicative
 * hashing of integers and addresses.  So, in HASH2INDEX function,
 * I take both lower bits and higher bits.
 */

#define STRING_HASH(hv, chars, size)                                    \
    do {                                                                \
        int i_ = (size);                                                \
        (hv) = 0;                                                       \
        while (i_-- > 0) {                                              \
            (hv) = ((hv)<<5) - (hv) + ((unsigned char)*chars++);        \
        }                                                               \
    } while (0)

h = h * 31 + c
どっかで、Javaも 31 * h + c だと聞いたような
コメントに、ハッシュの上位はあまりよくない、ので HASH2INDEX を使う、と書いてある：

/* HASH2INDEX
   Map a hash value to bucket number.
   We fix the word length to 32bits, since the multiplication
   constant above is fixed. */
#define HASH2INDEX(tabsiz, bits, hashval) \
    (((hashval)+((hashval)>>(32-(bits)))) & ((tabsiz) - 1))

Lua

ltable.cというソースがあるのでそれを見てみる。

/*
## returns the `main' position of an element in a table (that is, the index
## of its hash value)
*/
static Node *mainposition (const Table *t, const TValue *key) {
  switch (ttype(key)) {
    case LUA_TNUMBER:
      return hashnum(t, nvalue(key));
    case LUA_TSTRING:
      return hashstr(t, rawtsvalue(key));
    case LUA_TBOOLEAN:
    ...

#define hashstr(t,str)  hashpow2(t, (str)->tsv.hash)

#define hashpow2(t,n)      (gnode(t, lmod((n), sizenode(t))))

#define gnode(t,i)        (&(t)->node[i])

ここはすでに計算済みの値を参照してるだけっぽい。(str)->tsv.hash に代入しているところを探す：

TString *luaS_newlstr (lua_State *L, const char *str, size_t l) {
  GCObject *o;
  unsigned int h = cast(unsigned int, l);  /* seed */
  size_t step = (l>>5)+1;  /* if string is too long, don't hash all its chars */
  size_t l1;
  for (l1=l; l1>=step; l1-=step)  /* compute hash */
    h = h ^ ((h<<5)+(h>>2)+cast(unsigned char, str[l1-1]));
  ...
}

h = h ^ ((h<<5)+(h>>2) + c)
あまり長い文字列の場合飛ばし飛ばしにするのはいい案かも、この場合は最大31回の計算で済む
文字列の後ろからなめてくのは、後ろの文字ほどかき混ぜられるのでいい案かも

Ruby

intern だろうと目星をつけて、ソースを grep。うへ～たくさん。定義してるところを探すには…。結局のところ、rb_intern3 になるらしい。

ID
rb_intern3(const char *name, long len, rb_encoding *enc)
{
    const char *m = name;
    const char *e = m + len;
    ...

うわっ、長い！追っかけてみて、結局頭の部分

if (st_lookup(global_symbols.sym_id, str, (st_data_t *)&id))
    return id;

でテーブル参照。

int
st_lookup(st_table *table, register st_data_t key, st_data_t *value)
{
    unsigned int hash_val, bin_pos;
    register st_table_entry *ptr;

    if (table->entries_packed) {
        int i;
        for (i = 0; i < table->num_entries; i++) {
            if ((st_data_t)table->bins[i*2] == key) {
                if (value !=0) *value = (st_data_t)table->bins[i*2+1];
                return 1;
            }
        }
        return 0;
    }

    hash_val = do_hash(key, table);
    FIND_ENTRY(table, ptr, hash_val, bin_pos);

    if (ptr == 0) {
        return 0;
    }
    else {
        if (value != 0)  *value = ptr->record;
        return 1;
    }
}

#define do_hash(key,table) (unsigned int)(*(table)->type->hash)((key))

st_tableはハッシュ値の計算を外から設定できるようになってるらしい。global_symbols.sym_id はどうやって初期化してるのか。parse.c:

void
Init_sym(void)
{
    global_symbols.sym_id = st_init_table_with_size(&symhash, 1000);
    ...

static const struct st_hash_type symhash = {
    rb_str_hash_cmp,
    rb_str_hash,
};

int
rb_str_hash(VALUE str)
{
    return hash((const void *)RSTRING_PTR(str), RSTRING_LEN(str), 0);
}

/* MurmurHash described in [http://murmurhash.googlepages.com/](http://murmurhash.googlepages.com/) */
unsigned int
hash(const unsigned char * data, int len, unsigned int h)
{
    const unsigned int m = 0x7fd652ad;
    const int r = 16;

    h += 0xdeadbeef;

    if (len >= 4) {
#if !UNALIGNED_WORD_ACCESS
        int align = (VALUE)data & 3;
        if (align) {
            uint32_t t = 0, d = 0;
            int sl, sr, pack;

            switch (align) {
#ifdef WORDS_BIGENDIAN
              case 1: t |= data[2];
              case 2: t |= data[1] << 8;
              case 3: t |= data[0] << 16;
#else
              case 1: t |= data[2] << 16;
              case 2: t |= data[1] << 8;
              case 3: t |= data[0];
#endif
            }

#ifdef WORDS_BIGENDIAN
            t >>= (8 * align) - 8;
#else
            t <<= (8 * align);
#endif

            data += 4-align;
            len -= 4-align;

            sl = 8 * (4-align);
            sr = 8 * align;

            while (len >= 4) {
                d = *(uint32_t *)data;
#ifdef WORDS_BIGENDIAN
                t = (t << sr) | (d >> sl);
#else
                t = (t >> sr) | (d << sl);
#endif
                h += t;
                h *= m;
                h ^= h >> r;
                t = d;

                data += 4;
                len -= 4;
            }

            pack = len < align ? len : align;
            d = 0;
            switch (pack) {
#ifdef WORDS_BIGENDIAN
              case 3: d |= data[2] << 8;
              case 2: d |= data[1] << 16;
              case 1: d |= data[0] << 24;
              case 0:
                h += (t << sr) | (d >> sl);
#else
              case 3: d |= data[2] << 16;
              case 2: d |= data[1] << 8;
              case 1: d |= data[0];
              case 0:
                h += (t >> sr) | (d << sl);
#endif
                h *= m;
                h ^= h >> r;
            }

            data += pack;
            len -= pack;
        }
        else
#endif
        {
            do {
                h += *(uint32_t *)data;
                h *= m;
                h ^= h >> r;

                data += 4;
                len -= 4;
            } while (len >= 4);
        }
    }

    switch(len) {
#ifdef WORDS_BIGENDIAN
      case 3:
        h += data[2] << 8;
      case 2:
        h += data[1] << 16;
      case 1:
        h += data[0] << 24;
#else
      case 3:
        h += data[2] << 16;
      case 2:
        h += data[1] << 8;
      case 1:
        h += data[0];
#endif
        h *= m;
        h ^= h >> r;
    }

    h *= m;
    h ^= h >> 10;
    h *= m;
    h ^= h >> 17;

    return h;
}

MurmurHash

とりあえずハッシュ値の計算はこんな感じ。

Gauche

Lua

Ruby

関連記事

整列済み配列への挿入位置を二分探索

kd-treeの構築と近傍要素の抽出

【Haskell】ダイクストラ法を実装する

定数による整数除算の高速化【論文読み】

折れ線にフィットするベジェ曲線を求める方法

新着記事