· 5 years ago · Sep 04, 2020, 05:48 AM
1/* reads words, one-per line into a dynamically allocated and sized hashtable
2 * storage for words is dynamically allocated to exact length, table is
3 * re-hashed when load-factor exceeds 0.6. The header, source and example
4 * files are shown below, but can be compiled together in this single exmple.
5 * when separated into separate files, the hastable struct and list struct
6 * are opaque to the example file with typedefs and forward declarations
7 * being all that is visible to the user if compiled into a library.
8 * openSSL LH_strhash() function is used for hashing.
9 */
10
11/* =========================== wordhash_tbl.h =========================== */
12
13#ifndef _wordhash_tbl_h_
14#define _wordhash_tbl_h_ 1
15
16#include <stdio.h>
17
18#define HTSZ 256 /* default number of buckets for new hashtable */
19#define LOADFACT 0.6F /* never > .7 (buckets filled / total) */
20
21typedef struct _list_t list_t; /* typedef for opaque hashtable list */
22typedef struct _hashtable_t hashtable_t; /* typedef for opaque hashtable struct */
23
24int ht_size (hashtable_t *ht); /* size (total no. of buckets) */
25int ht_filled (hashtable_t *ht); /* buckets filled (used) */
26size_t ht_words (hashtable_t *ht); /* total words in hashtable */
27float ht_load (hashtable_t *ht); /* current load-factor (filled / total) */
28
29hashtable_t *ht_create (size_t size); /* create new hashtable (if 0 HTSZ used) */
30
31/* lookup functions */
32list_t *ht_lookup_str (hashtable_t *ht, const char *s);
33/* takes hash to avoid re-hash of same word if hash already computed */
34list_t *ht_lookup_hash (hashtable_t *ht, const char *s, unsigned int hval);
35
36/* insert string, delete string and free hashtable funcitons */
37int ht_insert_str (hashtable_t **ht, const char *s);
38void ht_delete_str (hashtable_t *ht, const char *s);
39void ht_free (hashtable_t *ht);
40
41#endif
42
43
44
45/* =========================== wordhash_tbl.c =========================== */
46
47#include <stdio.h>
48#include <stdlib.h>
49#include <string.h>
50#include <limits.h>
51#include <stdint.h>
52
53#include "wordhash_tbl.h"
54
55struct _list_t { /* opaque struct for hashtable list */
56 char *word;
57 struct _list_t *next;
58};
59
60struct _hashtable_t { /* opaque struct for hashtable */
61 int size;
62 int filled;
63 size_t words;
64 list_t **table;
65};
66
67int ht_size (hashtable_t *ht)
68{
69 return ht->size;
70}
71
72int ht_filled (hashtable_t *ht)
73{
74 return ht->filled;
75}
76
77size_t ht_words (hashtable_t *ht)
78{
79 return ht->words;
80}
81
82/** create hashtable of size 'size', return pointer to hashtable.
83 * allocate size buckets and validate, initializing all
84 * pointers NULL, setting ht->size to 'size' and filled = 0.
85 * returns pointer to allocated hashtable on success, NULL otherwise.
86 */
87hashtable_t *ht_create (size_t size)
88{
89 hashtable_t *ht;
90
91 if (size < 1) return NULL;
92
93 /* allocate hashtable */
94 if (!(ht = calloc (1, sizeof *ht)))
95 return NULL;
96
97 /* allocate & INITIALIZE element pointers NULL */
98 if (!(ht->table = calloc (size, sizeof *(ht->table))))
99 return NULL;
100
101 ht->size = size;
102 ht->filled = 0;
103 ht->words = 0;
104
105 return ht;
106}
107
108/** OPENSSL_LH_strhash
109 * The following hash seems to work very well on normal text words no
110 * collisions on /usr/dict/words and it distributes on %2^n quite well, not
111 * as good as MD5, but still good.
112 */
113uint32_t lh_strhash (const char *s)
114{
115 uint64_t ret = 0, v;
116 int64_t n = 0x100;
117 int32_t r;
118
119 if (!s || !*s) return (ret);
120
121 for (; *s; s++) {
122 v = n | (*s);
123 n += 0x100;
124 r = (int32_t)((v >> 2) ^ v) & 0x0f;
125 ret = (ret << r) | (ret >> (32 - r));
126 ret &= 0xFFFFFFFFL;
127 ret ^= v * v;
128 }
129 return ((ret >> 16) ^ ret);
130}
131
132/** wrapper for lh_strhash returning key (index), 0 - (ht->size - 1).
133 * modulo ht-size insures hash is coverted to key that
134 * corresponds to an index within the allocated hashtable size.
135 */
136uint32_t hash (hashtable_t *ht, const char *s)
137{
138 return lh_strhash (s) % ht->size;
139}
140
141list_t *ht_lookup_str (hashtable_t *ht, const char *s)
142{
143 if (!ht || !s) return NULL;
144
145 list_t *list;
146 uint32_t hashval = hash (ht, s);
147
148 /* go to the correct list based on the hash value and see if str is
149 * in the list. if found, return return a pointer to the list element,
150 * otherwise, return NULL.
151 */
152 for (list = (ht->table)[hashval]; list; list = list->next) {
153 if (strcmp (s, list->word) == 0) return list;
154 }
155
156 return NULL;
157}
158
159/** lookup prevents hashing twice when called from within a function */
160list_t *ht_lookup_hash (hashtable_t *ht, const char *s, unsigned int hval)
161{
162 if (!ht) return NULL;
163
164 for (list_t *list = (ht->table)[hval]; list; list = list->next) {
165 if (strcmp (s, list->word) == 0) return list;
166 }
167
168 return NULL;
169}
170
171/** resize hashtable increase by 2x current, rehashing all keys.
172 * double the current number of buckets, moving current list nodes
173 * to new buckets/lists within the resized hashtable, free existing
174 * ht->table returning pointer to resized hashtable on success,
175 * NULL otherwise.
176 */
177hashtable_t *ht_rehash (hashtable_t *ht)
178{
179 size_t htsz = ht->size * 2; /* double current hashtable size */
180 hashtable_t *nht = NULL;
181
182 if (!(nht = ht_create (htsz))) /* allocate new hashtable, validate */
183 return NULL;
184
185 nht->size = htsz; /* assign new size */
186
187 for (int i = 0; i < ht->size; i++) /* for each old bucket */
188 for (list_t *l = (ht->table)[i]; l;) { /* for each list entry */
189 list_t *nextlp = l->next; /* save next list ptr */
190 uint32_t nhash = hash (nht, l->word); /* compute new hash */
191 l->next = (nht->table)[nhash]; /* set next to index */
192 if (!(nht->table)[nhash]) /* set filled buckets */
193 (nht->filled)++;
194 (nht->table)[nhash] = l; /* set index to list */
195 l = nextlp; /* assign next old to iterate */
196 }
197
198 free (ht->table); /* free old list ptrs */
199 free (ht); /* free old hashtable */
200
201 return nht;
202}
203
204/** insert string into hashtable, check load, resize/rehash.
205 * returns 0 if string placed in new bucket 1 if string exists,
206 * otherwise, return -1 on invalid parameter or allocation failure,
207 * -2 on resize/rehash failure.
208 */
209int ht_insert_str (hashtable_t **ht, const char *s)
210{
211 if (!ht || !s) return -1;
212
213 if (!*ht) /* if hashtable NULL */
214 if (!(*ht = ht_create (HTSZ))) /* create empty hashtable & validate */
215 return -1;
216
217 list_t *new_list, *current_list;
218 uint32_t hashval = hash (*ht, s);
219 size_t len = 0;
220
221 /* Does item already exist? */
222 if ((current_list = ht_lookup_hash (*ht, s, hashval))) return 1;
223
224 /* allocate / validate new list node */
225 if (!(new_list = malloc (sizeof *new_list))) return -1;
226
227 if (!((*ht)->table)[hashval]) (*ht)->filled++; /* no. of filled buckets */
228
229 /* Insert into list */
230 len = strlen(s); /* get length of string */
231 new_list->word = malloc (len + 1); /* allocate string storage */
232 if (!new_list->word) { /* validate allocation */
233 perror ("malloc-new_list->word");
234 return -1;
235 }
236 memcpy (new_list->word, s, len + 1); /* copy string to new storage */
237 (*ht)->words++; /* increment word count */
238 new_list->next = ((*ht)->table)[hashval]; /* set next to key addr (ptr is NULL) */
239 ((*ht)->table)[hashval] = new_list; /* set key to list */
240
241 while (ht_load (*ht) > LOADFACT) { /* check load factor, resize/rehash */
242#ifdef DEBUG
243 printf (" ht_load : %.2f = %d / %d => %d\n",
244 ht_load (*ht), (*ht)->filled, (*ht)->size, (*ht)->size * 2);
245#endif
246 hashtable_t *tmp = ht_rehash (*ht);
247 if (!tmp)
248 return -2;
249 *ht = tmp;
250 }
251
252 return 0;
253}
254
255/** custom free for each type of list_t.
256 * (if not more than one member, just handle in ht_free)
257 */
258/*
259void ht_free_node (list_t *lp)
260{
261 free (lp->word);
262 free (lp->defn);
263 free (lp);
264}
265*/
266
267void ht_delete_str (hashtable_t *ht, const char *s)
268{
269 if (!ht || !s) return;
270
271 list_t *list, *victim, **pplist;
272 uint32_t hashval = hash (ht, s);
273
274 /* lookup element, bail if it does not exist */
275 if (!(victim = ht_lookup_hash (ht, s, hashval))) return;
276
277 list = ht->table[hashval];
278 pplist = &list;
279
280 for (; list; pplist = &list->next, list = list->next)
281 if (list == victim) {
282 *pplist = list->next;
283 free (victim->word);
284 free (victim);
285 ht->words--;
286 break;
287 }
288}
289
290void ht_free (hashtable_t *ht)
291{
292 int i;
293 list_t *list, *victim;
294
295 if (!ht) return;
296
297 /* Free the memory for every item in the table, including the
298 * words themselves.
299 */
300 for (i = 0; i < ht->size; i++) {
301 list = (ht->table)[i];
302 while (list) {
303 victim = list;
304 list = list->next;
305 free (victim->word);
306 free (victim);
307 }
308 }
309
310 /* Free the table itself */
311 free (ht->table);
312 free (ht);
313}
314
315float ht_load (hashtable_t *ht)
316{
317 return ht ? (float)ht->filled/ht->size : -1.0F;
318}
319
320
321/* =========================== wordhash_tst.c =========================== */
322
323#include "wordhash_tbl.h"
324
325#define MAXC 64 /* max characters in read buffer for words */
326#define TBLSIZE 256 /* initial number of buckets in table */
327
328int main (int argc, char **argv) {
329
330 hashtable_t *ht = NULL; /* pointer to hashtable */
331 char buf[MAXC]; /* buffer for reading from file */
332 /* use filename provided as 1st argument (stdin by default) */
333 FILE *fp = argc > 1 ? fopen (argv[1], "r") : stdin;
334
335 if (!fp) { /* validate file open for reading */
336 perror ("file open failed");
337 return 1;
338 }
339
340 if (!(ht = ht_create (TBLSIZE))) { /* create/validate hashtable */
341 perror ("ht_create (TBLSIZE)");
342 exit (EXIT_FAILURE);
343 }
344
345 while (fgets (buf, MAXC, fp)) {
346 int rtn; /* variable for insert return */
347 buf[strcspn(buf, "\n")] = 0; /* trim trailing '\n' from buf */
348 /* insert word in hashtable/validate */
349 if ((rtn = ht_insert_str (&ht, buf)) < 0) {
350 if (rtn == -1)
351 fputs ("error: bad parameter or allocation failure\n", stderr);
352 else
353 fputs ("error: resize/rehash failure\n", stderr);
354 exit (EXIT_FAILURE);
355 }
356 }
357
358 if (fp != stdin) /* close file if not stdin */
359 fclose (fp);
360
361 printf ("hast table size : %d\n"
362 "buckets filled : %d\n"
363 "load factor : %.2f\n"
364 "words in table : %zu\n",
365 ht_size(ht), ht_filled(ht), ht_load(ht), ht_words(ht));
366
367 ht_free (ht);
368}
369