1 : /*
2 : Copyright (C) 2007, Bruce Ediger
3 :
4 : This file is part of cl.
5 :
6 : cl is free software; you can redistribute it and/or modify
7 : it under the terms of the GNU General Public License as published by
8 : the Free Software Foundation; either version 2 of the License, or
9 : (at your option) any later version.
10 :
11 : cl is distributed in the hope that it will be useful,
12 : but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : GNU General Public License for more details.
15 :
16 : You should have received a copy of the GNU General Public License
17 : along with cl; if not, write to the Free Software
18 : Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 :
20 : */
21 : #include <stdio.h>
22 : #include <stdlib.h>
23 : #include <string.h>
24 :
25 : #include <hashtable.h>
26 :
27 : #include <node.h>
28 : #include <abbreviations.h> /* free_graph() */
29 :
30 : unsigned int hash_djb2(const char *s);
31 : void rehash_hashtable(struct hashtable *h);
32 : void reallocate_buckets(struct hashtable *h);
33 : int sparsebit(long i);
34 : int msbbit(long i);
35 :
36 : /* chain is the dummy node at head of hash chain */
37 : void
38 : insert_node(struct hashnode *chain, struct hashnode *n)
39 6185 : {
40 6185 : n->next = chain->next;
41 6185 : chain->next = n;
42 6185 : n->next->prev = n;
43 6185 : n->prev = chain;
44 6185 : }
45 :
46 : struct hashtable *
47 : init_hashtable(int buckets, int maxload)
48 71 : {
49 : int i;
50 : int f, z;
51 :
52 : struct hashtable *h;
53 :
54 71 : h = malloc(sizeof(*h));
55 :
56 71 : h->flags = 0;
57 :
58 : /* Insure that bucket array size is always a multiple of 2.
59 : * This amounted to about 10% run-time savings for small files
60 : * by avoiding the '%' operator. */
61 :
62 71 : f = msbbit(buckets);
63 71 : z = 1 << f;
64 :
65 71 : if (z < buckets)
66 0 : z <<= 1;
67 :
68 71 : buckets = z;
69 :
70 : /* buckets and sentinels */
71 71 : h->buckets =
72 : (struct hashnode **)malloc(buckets*sizeof(*h->buckets));
73 71 : memset((char *)h->buckets, '\0', buckets*sizeof(*h->buckets));
74 :
75 71 : h->sentinels =
76 : (struct hashnode **)malloc(buckets*sizeof(*h->sentinels));
77 71 : memset((char *)h->sentinels, '\0', buckets*sizeof(*h->sentinels));
78 :
79 4615 : for (i = 0; i < buckets; ++i)
80 : {
81 : /* first node is a dummy - for convenience and to hold chain stats */
82 :
83 4544 : h->buckets[i] = (struct hashnode *)malloc(sizeof(*h->buckets[i]));
84 4544 : h->buckets[i]->data = NULL;
85 4544 : h->buckets[i]->string = NULL;
86 4544 : h->buckets[i]->string_length = 0;
87 :
88 4544 : h->sentinels[i] = (struct hashnode *)malloc(sizeof(*h->sentinels[i]));
89 4544 : h->sentinels[i]->data = NULL;
90 :
91 4544 : h->buckets[i]->next = h->sentinels[i];
92 4544 : h->buckets[i]->prev = h->buckets[i];
93 4544 : h->sentinels[i]->next = h->sentinels[i];
94 4544 : h->sentinels[i]->prev = h->buckets[i];
95 :
96 4544 : h->buckets[i]->string = NULL;
97 4544 : h->buckets[i]->value = 0; /* split count of chain */
98 4544 : h->buckets[i]->nodes_in_chain = 0; /* split count of chain */
99 : }
100 :
101 : /* bucket allocation */
102 71 : h->currentsize = buckets;
103 71 : h->allocated = buckets;
104 :
105 71 : h->maxload = maxload;
106 :
107 : /* bucket splitting */
108 71 : h->p = 0;
109 71 : h->maxp = buckets;
110 :
111 71 : h->node_cnt = 0;
112 71 : h->rehash_cnt = 0;
113 :
114 71 : return h;
115 : }
116 :
117 : void
118 : rehash_hashtable(struct hashtable *h)
119 391 : {
120 : struct hashnode *l;
121 : struct hashnode *oldbucket, *oldtail;
122 : int newindex;
123 :
124 391 : ++h->rehash_cnt;
125 :
126 391 : if (h->currentsize >= h->allocated)
127 3 : reallocate_buckets(h);
128 :
129 391 : oldbucket = h->buckets[h->p];
130 391 : oldtail = h->sentinels[h->p];
131 391 : l = oldbucket->next;
132 :
133 391 : oldbucket->next = oldtail;
134 391 : oldtail->prev = oldbucket;
135 :
136 391 : newindex = h->p + h->maxp;
137 :
138 391 : if (h->flags > 1)
139 : {
140 0 : printf("# %d rehashes, %d total nodes, old bucket %d, new bucket %d\n",
141 : h->rehash_cnt, h->node_cnt, h->p, newindex);
142 0 : printf("# %d nodes in old bucket\n", oldbucket->nodes_in_chain);
143 : }
144 :
145 391 : oldbucket->nodes_in_chain = 0; /* may not be any lines in chain after rehash */
146 391 : ++oldbucket->value; /* number of splits */
147 :
148 391 : ++h->p;
149 :
150 391 : if (h->p == h->maxp)
151 : {
152 2 : h->maxp *= 2;
153 2 : h->p = 0;
154 : }
155 :
156 391 : ++h->currentsize;
157 :
158 6967 : while (oldtail != l)
159 : {
160 6185 : struct hashnode *t = l->next;
161 6185 : int idx = MOD(l->value, h->maxp);
162 :
163 6185 : if (idx < h->p)
164 6143 : idx = MOD(l->value, (2*h->maxp));
165 :
166 6185 : if (idx == newindex)
167 : {
168 3116 : insert_node(h->buckets[newindex], l);
169 3116 : ++h->buckets[newindex]->nodes_in_chain;
170 : } else {
171 3069 : insert_node(oldbucket, l);
172 3069 : ++oldbucket->nodes_in_chain;
173 : }
174 :
175 6185 : l = t;
176 : }
177 :
178 391 : if (h->flags > 1)
179 : {
180 0 : printf("# split: %d in old chain, %d in new chain\n",
181 : oldbucket->nodes_in_chain, h->buckets[newindex]->nodes_in_chain);
182 : }
183 391 : }
184 :
185 : void
186 : reallocate_buckets(struct hashtable *h)
187 3 : {
188 : int i;
189 : int newallocated;
190 :
191 3 : newallocated = h->allocated * 2;
192 :
193 3 : h->buckets = (struct hashnode **)realloc(
194 : h->buckets,
195 : sizeof(struct hashnode *)*newallocated
196 : );
197 :
198 3 : h->sentinels = (struct hashnode **)realloc(
199 : h->sentinels,
200 : sizeof(struct hashnode *)*newallocated
201 : );
202 :
203 451 : for (i = h->allocated; i < newallocated; ++i)
204 : {
205 448 : h->buckets[i] = (struct hashnode *)malloc(sizeof(*h->buckets[i]));
206 448 : h->sentinels[i] = (struct hashnode *)malloc(sizeof(*h->sentinels[i]));
207 :
208 448 : h->buckets[i]->next = h->sentinels[i];
209 448 : h->sentinels[i]->prev = h->buckets[i];
210 448 : h->sentinels[i]->next = h->sentinels[i];
211 448 : h->buckets[i]->prev = h->buckets[i];
212 :
213 448 : h->buckets[i]->string = NULL;
214 448 : h->buckets[i]->data = NULL;
215 448 : h->sentinels[i]->string = NULL;
216 448 : h->sentinels[i]->data = NULL;
217 :
218 448 : h->buckets[i]->nodes_in_chain = 0;
219 448 : h->buckets[i]->value = 0; /* number of times chain splits, too */
220 : }
221 :
222 3 : h->allocated = newallocated;
223 3 : }
224 :
225 : const char *
226 : add_string(struct hashtable *h, const char *string)
227 6568 : {
228 6568 : return (const char *)add_data(h, string, NULL);
229 : }
230 :
231 : void *
232 : add_data(struct hashtable *h, const char *string, void *data)
233 6568 : {
234 : struct hashnode *n, *head;
235 : int bucket_index;
236 : unsigned int hv;
237 : int hash_mod;
238 :
239 6568 : if((n = node_lookup(h, string, &hv)))
240 : {
241 1141 : void *r = NULL;
242 1141 : if (data)
243 : {
244 0 : r = (void *)n->data;
245 0 : n->data = data;
246 : } else {
247 1141 : r = (void *)n->string;
248 : }
249 1141 : return r;
250 : }
251 :
252 5427 : bucket_index = MOD(hv, h->maxp);
253 5427 : hash_mod = h->maxp;
254 5427 : if (bucket_index < h->p)
255 : {
256 1835 : bucket_index = MOD(hv, (2*h->maxp));
257 1835 : hash_mod = 2*h->maxp;
258 : }
259 :
260 5427 : if (h->flags > 1)
261 0 : printf("# \"%s\": hash value %u, base %d, bucket %d\n", string, hv, hash_mod, bucket_index);
262 :
263 : /* allocate new node */
264 5427 : n = (struct hashnode *)malloc(sizeof(*n));
265 :
266 : /* fill in newly allocated node */
267 5427 : n->value = hv;
268 5427 : n->string_length = strlen(string);
269 5427 : n->string = malloc(n->string_length+1);
270 5427 : memcpy(n->string, string, n->string_length+1);
271 5427 : n->data = data;
272 :
273 : /* add newly allocated node to appropriate chain */
274 5427 : head = h->buckets[bucket_index];
275 :
276 : /* just push it on the chain */
277 5427 : n->next = head->next;
278 5427 : head->next = n;
279 5427 : n->next->prev = n;
280 5427 : n->prev = head;
281 :
282 5427 : ++h->buckets[bucket_index]->nodes_in_chain;
283 :
284 5427 : ++h->node_cnt;
285 :
286 : /* "load" on hashtable too high? */
287 5427 : if (h->node_cnt/h->currentsize > h->maxload)
288 391 : rehash_hashtable(h);
289 :
290 5427 : return (NULL == data)? (void *)n->string: NULL;
291 : }
292 :
293 : void free_hashtable(struct hashtable *h)
294 69 : {
295 : unsigned int i;
296 :
297 4876 : for (i = 0; i < h->currentsize; ++i)
298 : {
299 4807 : struct hashnode *chain = h->buckets[i];
300 19848 : while (chain != h->sentinels[i])
301 : {
302 10234 : struct hashnode *tmp = chain->next;
303 10234 : if (chain->string)
304 : {
305 5427 : free(chain->string);
306 5427 : chain->string = NULL;
307 : }
308 10234 : free_graph(chain->data);
309 10234 : chain->data = NULL;
310 10234 : free(chain);
311 10234 : chain = tmp;
312 : }
313 4807 : free(h->sentinels[i]);
314 4807 : h->sentinels[i] = NULL;
315 : }
316 :
317 : /* free the dummy heads and sentinels that the table
318 : * doesn't currently use - it doubles bucket count
319 : * every time the "load" gets too high */
320 126 : for (i = h->currentsize; i < h->allocated; ++i)
321 : {
322 57 : free(h->buckets[i]);
323 57 : h->buckets[i] = NULL;
324 57 : free(h->sentinels[i]);
325 57 : h->sentinels[i] = NULL;
326 : }
327 :
328 69 : free(h->buckets);
329 69 : free(h->sentinels);
330 69 : h->buckets = NULL;
331 69 : h->sentinels = NULL;
332 :
333 69 : free(h);
334 69 : h = NULL;
335 69 : }
336 :
337 : const char *
338 : string_lookup(struct hashtable *h, const char *string_to_lookup)
339 0 : {
340 0 : struct hashnode *hn = NULL;
341 0 : char *r = NULL;
342 : unsigned int hv; /* dummy - not used in this function */
343 :
344 0 : if (NULL != (hn = node_lookup(h, string_to_lookup, &hv)))
345 0 : r = hn->string;
346 :
347 0 : return r;
348 : }
349 :
350 : void *
351 : data_lookup(struct hashtable *h, const char *string_to_lookup)
352 1153 : {
353 1153 : struct hashnode *hn = NULL;
354 1153 : void *r = NULL;
355 : unsigned int hv; /* dummy - not used in this function */
356 :
357 1153 : if (NULL != (hn = node_lookup(h, string_to_lookup, &hv)))
358 1153 : r = hn->data;
359 :
360 1153 : return r;
361 : }
362 :
363 : /* note that 3rd argument must evaluate to non-NULL - returns hash value
364 : * of string_to_lookup to add_data() to allow it to skip doing a hash
365 : */
366 : struct hashnode *
367 : node_lookup(struct hashtable *h, const char *string_to_lookup, unsigned int *rhv)
368 12866 : {
369 12866 : struct hashnode *r = NULL;
370 12866 : unsigned int hashval = hash_djb2(string_to_lookup);
371 12866 : int idx = MOD(hashval, h->maxp);
372 : struct hashnode *chain;
373 :
374 12866 : *rhv = hashval;
375 :
376 12866 : if (idx < h->p)
377 3669 : idx = MOD(hashval, (2*h->maxp));
378 :
379 12866 : chain = h->buckets[idx]->next;
380 :
381 93656 : while (chain != h->sentinels[idx])
382 : {
383 : /* checking the equivalence of hash value first prevents
384 : * expensive byte-by-byte strcmp(). Seems to improve performance. */
385 75363 : if (chain->value == hashval &&
386 : !strcmp(chain->string, string_to_lookup))
387 : {
388 7439 : r = chain;
389 7439 : break;
390 : }
391 :
392 67924 : chain = chain->next;
393 : }
394 :
395 12866 : return r;
396 : }
397 :
398 : unsigned int
399 : hash_djb2(const char *str)
400 12866 : {
401 12866 : unsigned long hash = 5381;
402 : int c;
403 :
404 81696 : while ((c = *str++))
405 55964 : hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
406 :
407 12866 : return hash;
408 : }
409 :
410 : int
411 : sparsebit(long i)
412 71 : {
413 71 : int p = 0;
414 :
415 71 : if (i == 0) return(-1); /* no bits set */
416 :
417 71 : if ((i & (-i)) != i) return(-1); /* two or more bits set */
418 :
419 71 : if (i & 0xAAAAAAAA) p++;
420 71 : if (i & 0xCCCCCCCC) p += 2;
421 71 : if (i & 0xF0F0F0F0) p += 4;
422 71 : if (i & 0xFF00FF00) p += 8;
423 71 : if (i & 0xFFFF0000) p += 16;
424 :
425 71 : return p;
426 : }
427 :
428 : int
429 : msbbit(long i)
430 71 : {
431 : long i2;
432 :
433 71 : if (i<0) return(31); /* speed-up assumes a 'long's msb is at pos 31 */
434 :
435 71 : while ((i2 = (i & (i-1)))) i = i2;
436 :
437 71 : return(sparsebit(i));
438 : }
|