Lua5.1源码分析——Table

lazy ♦ 8月 16, 2014 ♦ Leave Your Comment

数据结构

Table 的定义在 lobject.h 中。

/*
** Tables
*/

typedef union TKey {
  struct {
    TValuefields;
    struct Node *next;  /* for chaining */
  } nk;
  TValue tvk;
} TKey;

//hash部分的节点
typedef struct Node {
  TValue i_val;
  TKey i_key;
} Node;


typedef struct Table {
  CommonHeader;
  lu_byte flags;  /* 1<<p means tagmethod(p) is not present */ 
  lu_byte lsizenode;  /* log2 of size of `node' array */
  struct Table *metatable;
  TValue *array;  /* array part */
  Node *node;  //hash part
  Node *lastfree;  /* any free position is before this position */
  GCObject *gclist;
  int sizearray;  /* size of `array' array */
} Table;

Table中的数据存储分为两部分，数组部分array和hash部分node。数组部分就是一个TValue的C数组，而hash部分则是一个hash表，每个节点Node包括一组键值对，Table中的node变量记录hash表头。注意Node中的Key部分比较奇怪，这是一个union，里面可能放着一个TValue,也可能是一个TValue+Node,如果有TKey a, 那么a.tvk.value == a.nk.value; a.tvk.tt == a.nk.tt.不太明白为什么不直接用一个struct里面放TValue和Node，而要用一个union。

ltable.h中定义了几个存取变量用的宏。

//(Node*) Table.node[i] 
#define gnode(t,i)	(&(t)->node[i])
//(TValue+Node*) Node.i_key.nk
#define gkey(n)		(&(n)->i_key.nk)
//(TValue) Node.i_val
#define gval(n)		(&(n)->i_val)
//(Node*) Node.nk.next
#define gnext(n)	((n)->i_key.nk.next)
//(TValue) Node.i_key.tvk
#define key2tval(n)	(&(n)->i_key.tvk)

new & free

//一个空Node,用来减小创建空hash表的开销
#define dummynode		(&dummynode_)

static const Node dummynode_ = {
  {{NULL}, LUA_TNIL},  /* value */
  {{{NULL}, LUA_TNIL, NULL}}  /* key */
};

//new Table, narray为数组部分的大小，nhash为hash部分的大小
Table *luaH_new (lua_State *L, int narray, int nhash) {
  Table *t = luaM_new(L, Table); //申请内存
  luaC_link(L, obj2gco(t), LUA_TTABLE); //将 t 连接到 global_state->rootgc (所有gc对象链表)上
  t->metatable = NULL; 
  t->flags = cast_byte(~0);
  /* temporary values (kept only if some malloc fails) */
  t->array = NULL; 
  t->sizearray = 0;
  t->lsizenode = 0;
  t->node = cast(Node *, dummynode);
  setarrayvector(L, t, narray);  //初始化 array 部分
  setnodevector(L, t, nhash); //初始化 hash 部分
  return t;
}

static void setarrayvector (lua_State *L, Table *t, int size) {
  int i;
  //分配内存
  luaM_reallocvector(L, t->array, t->sizearray, size, TValue);  
  for (i=t->sizearray; i<size; i++)
     setnilvalue(&t->array[i]);   //初始化数组每个元素为nil
  t->sizearray = size; //更新大小
}

static void setnodevector (lua_State *L, Table *t, int size) {
  int lsize;
  if (size == 0) {  /* no elements to hash part? */
    //如果大小是0就直接等于dummynode
    t->node = cast(Node *, dummynode);  /* use common `dummynode' */
    lsize = 0;
  }
  else {
    int i;
    lsize = ceillog2(size);  //lsize = int( log2(size-1) ) + 1  , size <= 2^lsize 
    if (lsize > MAXBITS)
      luaG_runerror(L, "table overflow");
    size = twoto(lsize);  //实际分配大小 2^lsize
    t->node = luaM_newvector(L, size, Node);   //申请内存
    for (i=0; i<size; i++) {  //遍历初始化
      Node *n = gnode(t, i);
      gnext(n) = NULL;
      setnilvalue(gkey(n));
      setnilvalue(gval(n));
    }
  }
  t->lsizenode = cast_byte(lsize);  //lsizenode = unsigned char (log2 (size)) 
  t->lastfree = gnode(t, size);  /* all positions are free 指向最后一个节点*/  
}

//free函数很简单,只是释放 array ,hash 内存和 Table 结构本身的内存
void luaH_free (lua_State *L, Table *t) {
  if (t->node != dummynode)
    luaM_freearray(L, t->node, sizenode(t), Node);
  luaM_freearray(L, t->array, t->sizearray, TValue);
  luaM_free(L, t);
}

ltable.c中最复杂的部分在于newkey这个函数,这个函数负责在hash表部分新建一个键值,并返回对应Node的value.每个key都可以得到一个hash值h,这个h所在的hash表的位置叫做这个key的mainposition主键位,如果主键位已经有别的Node othern占用,就判断othern是不是在它的主键位,如果是,说明这个键位存在冲突,把正在创建的node放到一个别的空位置(getfreepos)去,如果othern不在它的主键位,就把othern放到freepos去,把正在创建的node放到该主键位.比起string的hash表,使用这样的hash可以使内存的利用更紧凑,分配的内存都用光后才进行rehash.

/*
** inserts a new key into a hash table; first, check whether key's main 
** position is free. If not, check whether colliding node is in its main 
** position or not: if it is not, move colliding node to an empty place and 
** put new key in its main position; otherwise (colliding node is in its main 
** position), new key goes to an empty position. 
*/
static TValue *newkey (lua_State *L, Table *t, const TValue *key) {
  Node *mp = mainposition(t, key);  //得到主键位node
  if (!ttisnil(gval(mp)) || mp == dummynode) { //如果主键位被占用
    Node *othern;
    Node *n = getfreepos(t);  /* get a free place */
    if (n == NULL) {  /* cannot find a free place? */
      rehash(L, t, key);  /* grow table */
      return luaH_set(L, t, key);  /* re-insert key into grown table */
    }
    lua_assert(n != dummynode);
    othern = mainposition(t, key2tval(mp));  //计算othern的主键位
    if (othern != mp) {  /* is colliding node out of its main position? */
      /* yes; move colliding node into free position */
      while (gnext(othern) != mp) othern = gnext(othern);  /* find previous */
      gnext(othern) = n;  /* redo the chain with `n' in place of `mp' */
      *n = *mp;  /* copy colliding node into free pos. (mp->next also goes) */
      gnext(mp) = NULL;  /* now `mp' is free */
      setnilvalue(gval(mp));
    }
    else {  /* colliding node is in its own main position */
      /* new node will go into free position */
      gnext(n) = gnext(mp);  /* chain new position */
      gnext(mp) = n;
      mp = n;
    }
  }
//设置新node的key值
  gkey(mp)->value = key->value; gkey(mp)->tt = key->tt;
//some gc management
  luaC_barriert(L, t, key);
  lua_assert(ttisnil(gval(mp)));
  return gval(mp);  //返回新node的value
}

getfreepos函数就是从后往前找空位置

static Node *getfreepos (Table *t) {
  while (t->lastfree-- > t->node) {
    if (ttisnil(gkey(t->lastfree)))
      return t->lastfree;
  }
  return NULL;  /* could not find a free place */
}

mainposition函数则是对不同类型的TValue进行hash,返回对应的node.

/*
** returns the `main' position of an element in a table (that is, the index
** of its hash value)
*/
static Node *mainposition (const Table *t, const TValue *key) {
  switch (ttype(key)) {
    case LUA_TNUMBER:
      return hashnum(t, nvalue(key));
    case LUA_TSTRING:
      return hashstr(t, rawtsvalue(key));
    case LUA_TBOOLEAN:
      return hashboolean(t, bvalue(key));
    case LUA_TLIGHTUSERDATA:
      return hashpointer(t, pvalue(key));
    default:
      return hashpointer(t, gcvalue(key));
  }
}

/*
** hash for lua_Numbers
*/
static Node *hashnum (const Table *t, lua_Number n) {
  unsigned int a[numints];
  int i;
  n += 1;  /* normalize number (avoid -0) */
  lua_assert(sizeof(a) <= sizeof(n));
  memcpy(a, &n, sizeof(a));
  for (i = 1; i < numints; i++) a[0] += a[i];
  return hashmod(t, a[0]);
}

#define hashpow2(t,n)      (gnode(t, lmod((n), sizenode(t))))
  
#define hashstr(t,str)  hashpow2(t, (str)->tsv.hash)
#define hashboolean(t,p)        hashpow2(t, p)
/*
** for some types, it is better to avoid modulus by power of 2, as
** they tend to have many 2 factors.
*/
#define hashmod(t,n)	(gnode(t, ((n) % ((sizenode(t)-1)|1))))

#define hashpointer(t,p)	hashmod(t, IntPoint(p))

Set & Get

set & get 主要有下面几个函数.

LUAI_FUNC const TValue *luaH_getnum (Table *t, int key);
LUAI_FUNC TValue *luaH_setnum (lua_State *L, Table *t, int key);
LUAI_FUNC const TValue *luaH_getstr (Table *t, TString *key);
LUAI_FUNC TValue *luaH_setstr (lua_State *L, Table *t, TString *key);
LUAI_FUNC const TValue *luaH_get (Table *t, const TValue *key);
LUAI_FUNC TValue *luaH_set (lua_State *L, Table *t, const TValue *key);

主要逻辑在get函数中,set函数只是调用get然后赋值

/*
** main search function
*/
const TValue *luaH_get (Table *t, const TValue *key) {
  switch (ttype(key)) {
    case LUA_TNIL: return luaO_nilobject;  //没有nil key,返回一个全局空对象
    case LUA_TSTRING: return luaH_getstr(t, rawtsvalue(key));
    case LUA_TNUMBER: {
      int k;
      lua_Number n = nvalue(key);
      lua_number2int(k, n);  //将double转换为int
      if (luai_numeq(cast_num(k), nvalue(key))) /* index is int? */
          //整数key才调用下面的函数,否则到下面查找hash表部分
          return luaH_getnum(t, k);  /* use specialized version */
      /* else go through */
    }
    default: {
      Node *n = mainposition(t, key);
      do {  /* check whether `key' is somewhere in the chain */
        if (luaO_rawequalObj(key2tval(n), key))
          return gval(n);  /* that's it */
        else n = gnext(n);
      } while (n);  //遍历主键位链表
      return luaO_nilobject;
    }
  }
}
/*
** search function for integers
*/
const TValue *luaH_getnum (Table *t, int key) {
  /* (1 <= key && key <= t->sizearray) */
  //先查找array部分
  if (cast(unsigned int, key-1) < cast(unsigned int, t->sizearray))
    return &t->array[key-1];
  else { //查找hash部分
    lua_Number nk = cast_num(key);
    Node *n = hashnum(t, nk);
    do {  /* check whether `key' is somewhere in the chain */
      if (ttisnumber(gkey(n)) && luai_numeq(nvalue(gkey(n)), nk))
        return gval(n);  /* that's it */
      else n = gnext(n);
    } while (n);
    return luaO_nilobject;
  }
}
/*
** search function for strings
*/
//逻辑基本和get函数的defaut处理相同,不知道为什么单独处理
const TValue *luaH_getstr (Table *t, TString *key) {
  Node *n = hashstr(t, key);
  do {  /* check whether `key' is somewhere in the chain */
    if (ttisstring(gkey(n)) && rawtsvalue(gkey(n)) == key)
      return gval(n);  /* that's it */
    else n = gnext(n);
  } while (n);
  return luaO_nilobject;
}

set函数不处理具体的赋值,只是调用get函数,如果没有对应的键,则新建一个node并返回对应的value,赋值操作由调用者完成.

TValue *luaH_set (lua_State *L, Table *t, const TValue *key) {
  const TValue *p = luaH_get(t, key);
  t->flags = 0;
  if (p != luaO_nilobject)
    return cast(TValue *, p);
  else { 
    //这里感觉写啰嗦了, ttisnil(nilobject) 一定为true
    if (ttisnil(key)) luaG_runerror(L, "table index is nil");
    else if (ttisnumber(key) && luai_numisnan(nvalue(key)))
      luaG_runerror(L, "table index is NaN");
    return newkey(L, t, key);
  }
}


TValue *luaH_setnum (lua_State *L, Table *t, int key) {
  const TValue *p = luaH_getnum(t, key);
  if (p != luaO_nilobject)
    return cast(TValue *, p);
  else {
    TValue k;
    setnvalue(&k, cast_num(key));
    return newkey(L, t, &k);
  }
}


TValue *luaH_setstr (lua_State *L, Table *t, TString *key) {
  const TValue *p = luaH_getstr(t, key);
  if (p != luaO_nilobject)
    return cast(TValue *, p);
  else {
    TValue k;
    setsvalue(L, &k, key);
    return newkey(L, t, &k);
  }
}

Rehash

//newkey 空间不足时调用该函数
static void rehash (lua_State *L, Table *t, const TValue *ek) {
  int nasize, na; 
  //num[i]记录从2^(i-1)到2^i的key数量,用来计算最终分配的array大小
  int nums[MAXBITS+1];  /* nums[i] = number of keys between 2^(i-1) and 2^i */
  int i;
  int totaluse;
  for (i=0; i<=MAXBITS; i++) nums[i] = 0;  /* reset counts */
  //计算array部分用掉的key数量,同时更新nums
  nasize = numusearray(t, nums);  /* count keys in array part */
  totaluse = nasize;  /* all those keys are integer keys */
  //总共的key数量还要加上hash部分的key数量,该函数也更新nums和nasize
  totaluse += numusehash(t, nums, &nasize);  /* count keys in hash part */
  /* count extra key */
  //还要加上传入的ek
  //如果ek是整数,countint返回1并更新nums,否则返回0
  nasize += countint(ek, nums); 
  totaluse++;
  /* compute new size for array part */
  //到这里,nasize为所有整数key数量,totaluse是所有key数量
  //computesizes计算应该分配的数组大小
  //返回值为新数组部分的key数量,更新nasize为新数组大小
  na = computesizes(nums, &nasize);
  /* resize the table to new computed sizes */
  resize(L, t, nasize, totaluse - na);
}

//计算数组部分key数量
static int numusearray (const Table *t, int *nums) {
  int lg;
  int ttlg;  /* 2^lg */
  int ause = 0;  /* summation of `nums' */
  int i = 1;  /* count to traverse all array keys */
  //每次乘2,分段计算key数量
  for (lg=0, ttlg=1; lg<=MAXBITS; lg++, ttlg*=2) {  /* for each slice */
    int lc = 0;  /* counter */
    int lim = ttlg;
    if (lim > t->sizearray) {
      lim = t->sizearray;  /* adjust upper limit 最大计算到 sizearray*/
      if (i > lim) //如果上个for循环达到lim则break结束循环
        break;  /* no more elements to count */
    }
    /* count elements in range (2^(lg-1), 2^lg] */
    for (; i <= lim; i++) {
      if (!ttisnil(&t->array[i-1]))
        lc++;
    }
    nums[lg] += lc; //更新nums
    ause += lc; //更新key总数
  }
  return ause;
}

//计算hash部分key数量.更新pnasize和nums
static int numusehash (const Table *t, int *nums, int *pnasize) {
  int totaluse = 0;  /* total number of elements */
  int ause = 0;  /* summation of `nums' */
  int i = sizenode(t);
  while (i--) {  //倒着遍历整个hash表
    Node *n = &t->node[i];
    if (!ttisnil(gval(n))) {
      //如果是int则数组key数量+1,更新nums
      ause += countint(key2tval(n), nums);
      totaluse++;
    }
  }
  *pnasize += ause;  //更新pnasize
  return totaluse; //返回hash所有key数量
}

static int countint (const TValue *key, int *nums) {
  int k = arrayindex(key);
  if (0 < k && k <= MAXASIZE) {  /* is `key' an appropriate array index? */
    nums[ceillog2(k)]++;  /* count as such */
    return 1;
  }
  else
    return 0;
}

/*
** returns the index for `key' if `key' is an appropriate key to live in
** the array part of the table, -1 otherwise.
*/
static int arrayindex (const TValue *key) {
  if (ttisnumber(key)) {
    lua_Number n = nvalue(key);
    int k;
    lua_number2int(k, n);
    if (luai_numeq(cast_num(k), n)) //key为整数
      return k; //返回整数
  }
  return -1;  /* `key' did not match some condition */
}

//计算新分配的数组大小,返回值为新数组部分的key数量,更新narray为新数组大小
static int computesizes (int nums[], int *narray) {
  int i;
  int twotoi;  /* 2^i */
  int a = 0;  /* number of elements smaller than 2^i */
  int na = 0;  /* number of elements to go to array part */
  int n = 0;  /* optimal size for array part */
  for (i = 0, twotoi = 1; twotoi/2 < *narray; i++, twotoi *= 2) {
    if (nums[i] > 0) {
      a += nums[i];
      //如果小于2^i的key数量 > 2^i / 2,就更新数组的大小为2^i
      if (a > twotoi/2) {  /* more than half elements present? */
        n = twotoi;  /* optimal size (till now) */
        na = a;  /* all elements smaller than n will go to array part */
      }
    }
    if (a == *narray) break;  /* all elements already counted */
  }
  *narray = n;
  lua_assert(*narray/2 <= na && na <= *narray);
  return na;
}

由上面的函数可以看出,数组部分大小的分配原则是要有一半以上的空间利用,因此如果表里的int key值很大又不多的话,会被分配到hash部分去,而不会在array部分分配很大空间.

最后是两个resize函数.

static void resize (lua_State *L, Table *t, int nasize, int nhsize) {
  int i;
  int oldasize = t->sizearray;
  int oldhsize = t->lsizenode;
  Node *nold = t->node;  /* save old hash ... */
  if (nasize > oldasize)  /* array part must grow? */
    setarrayvector(L, t, nasize);
  /* create new hash part with appropriate size */
  setnodevector(L, t, nhsize);  
  if (nasize < oldasize) {  /* array part must shrink? */
    t->sizearray = nasize;
    /* re-insert elements from vanishing slice */
    //如果新array变小了,就把缩小部分的数据塞到hash表里,所以前面要先初始化hash表
    for (i=nasize; i<oldasize; i++) {
      if (!ttisnil(&t->array[i]))  
        //t.node[i+1] = t.array[i]  hash表中的key与lua里一样
        setobjt2t(L, luaH_setnum(L, t, i+1), &t->array[i]);
    }
    /* shrink array */
    luaM_reallocvector(L, t->array, oldasize, nasize, TValue);
  }
  /* re-insert elements from hash part */
  for (i = twoto(oldhsize) - 1; i >= 0; i--) {
    Node *old = nold+i;
    if (!ttisnil(gval(old)))
      setobjt2t(L, luaH_set(L, t, key2tval(old)), gval(old));
  }
  if (nold != dummynode)
    luaM_freearray(L, nold, twoto(oldhsize), Node);  /* free old array */
}


void luaH_resizearray (lua_State *L, Table *t, int nasize) {
  int nsize = (t->node == dummynode) ? 0 : sizenode(t);
  resize(L, t, nasize, nsize);
}

Search

剩下的几个函数都归到search里吧

/*
** returns the index of a `key' for table traversals. First goes all
** elements in the array part, then elements in the hash part. The
** beginning of a traversal is signalled by -1.
*/
static int findindex (lua_State *L, Table *t, StkId key) {
  int i;
  if (ttisnil(key)) return -1;  /* first iteration */
  i = arrayindex(key);
  if (0 < i && i <= t->sizearray)  /* is `key' inside array part? */
    return i-1;  /* yes; that's the index (corrected to C) */
  else {
    Node *n = mainposition(t, key);
    do {  /* check whether `key' is somewhere in the chain */
      /* key may be dead already, but it is ok to use it in `next' */
      if (luaO_rawequalObj(key2tval(n), key) ||
            (ttype(gkey(n)) == LUA_TDEADKEY && iscollectable(key) &&
             gcvalue(gkey(n)) == gcvalue(key))) { //这个条件我还不太懂
        i = cast_int(n - gnode(t, 0));  /* key index in hash table */
        /* hash elements are numbered after array ones */
        return i + t->sizearray;
      }
      else n = gnext(n);
    } while (n);
    luaG_runerror(L, "invalid key to " LUA_QL("next"));  /* key not found */
    return 0;  /* to avoid warnings */
  }
}

typedef TValue *StkId;  /* index to stack elements */

//该函数传入一个key,然后返回下个键值对
int luaH_next (lua_State *L, Table *t, StkId key) {
  int i = findindex(L, t, key);  /* find original element */
  for (i++; i < t->sizearray; i++) {  /* try first array part */
    if (!ttisnil(&t->array[i])) {  /* a non-nil value? */
      setnvalue(key, cast_num(i+1));
      setobj2s(L, key+1, &t->array[i]);
      return 1;
    }
  }
  for (i -= t->sizearray; i < sizenode(t); i++) {  /* then hash part */
    if (!ttisnil(gval(gnode(t, i)))) {  /* a non-nil value? */
      setobj2s(L, key, key2tval(gnode(t, i)));
      setobj2s(L, key+1, gval(gnode(t, i)));
      return 1;
    }
  }
  return 0;  /* no more elements */
}
//上面那个函数只在 lua_next 这个api函数中用到,结合起来看好懂一些
LUA_API int lua_next (lua_State *L, int idx) {
  StkId t;
  int more;
  lua_lock(L);
  t = index2adr(L, idx);
  api_check(L, ttistable(t));
  more = luaH_next(L, hvalue(t), L->top - 1);
  if (more) {
    api_incr_top(L);
  }
  else  /* no more elements */
    L->top -= 1;  /* remove key */
  lua_unlock(L);
  return more;
}

getn函数.脚本里#操作符和getn函数最后会调用改函数

/*
** Try to find a boundary in table `t'. A `boundary' is an integer index
** such that t[i] is non-nil and t[i+1] is nil (and 0 if t[1] is nil).
*/
//这个函数返回最大整数key,注意这个函数使用二分法查找,所以如果中间的数据不连续,
//很可能得到不正确的结果
int luaH_getn (Table *t) {
  unsigned int j = t->sizearray;
  if (j > 0 && ttisnil(&t->array[j - 1])) {
    /* there is a boundary in the array part: (binary) search for it */
    unsigned int i = 0;
    while (j - i > 1) {
      unsigned int m = (i+j)/2;
      if (ttisnil(&t->array[m - 1])) j = m;
      else i = m;
    }
    return i;
  }
  /* else must find a boundary in hash part */
  else if (t->node == dummynode)  /* hash part is empty? */
    return j;  /* that is easy... */
  else return unbound_search(t, j);
}

static int unbound_search (Table *t, unsigned int j) {
  unsigned int i = j;  /* i is zero or a present index */
  j++;
  /* find `i' and `j' such that i is present and j is not */
  //找到一个i使  i < max < 2i
  while (!ttisnil(luaH_getnum(t, j))) {
    i = j;
    //查找最大值的范围每次循环扩大一倍[i, 2i]
    j *= 2; 
    if (j > cast(unsigned int, MAX_INT)) {  /* overflow? */
      /* table was built with bad purposes: resort to linear search */
      //如果扩大到MAX_INT还找不到,就会遍历整个表,效率很低
      i = 1;
      while (!ttisnil(luaH_getnum(t, i))) i++;
      return i - 1;
    }
  }
  /* now do a binary search between them */
  while (j - i > 1) {
    unsigned int m = (i+j)/2;
    if (ttisnil(luaH_getnum(t, m))) j = m;
    else i = m;
  }
  return i;
}

Posted in: Lua

Lua5.1源码分析——Table

数据结构

new & free

Set & Get

Rehash

Search

近期文章

文章归档

分类目录

友情链接

功能

Lua5.1源码分析——Table

数据结构

new & free

Set & Get

Rehash

Search

近期文章

文章归档

分类目录

标签

友情链接

功能