Skip to content

Commit 954b58c

Browse files
committed
hashread improved using a fixed prefix
1 parent e94679a commit 954b58c

7 files changed

Lines changed: 274 additions & 96 deletions

File tree

gclib/GBase.cpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -690,6 +690,30 @@ int strhash(const char* str){
690690
return h;
691691
}
692692

693+
int djb_hash(const char* cp)
694+
{
695+
int h = 5381;
696+
while (*cp)
697+
h = (int)(33 * h ^ (unsigned char) *cp++);
698+
return (h & 0x7FFFFFFF); //always positive
699+
//return h;
700+
//return absolute value of this int:
701+
//int mask = (h >> (sizeof(int) * CHAR_BIT - 1));
702+
//return (h + mask) ^ mask;
703+
}
704+
705+
/* Fowler/Noll/Vo (FNV) hash function, variant 1a */
706+
int fnv1a_hash(const char* cp) {
707+
int h = 0x811c9dc5;
708+
while (*cp) {
709+
h ^= (unsigned char) *cp++;
710+
h *= 0x01000193;
711+
}
712+
//return h;
713+
return (h & 0x7FFFFFFF);
714+
}
715+
716+
693717
// removes the last part (file or directory name) of a full path
694718
// this is a destructive operation for the given string!!!
695719
// the trailing '/' is guaranteed to be there

gclib/GBase.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -282,7 +282,9 @@ bool endsWith(const char* s, const char* suffix);
282282
// ELF hash function for strings
283283
int strhash(const char* str);
284284

285-
285+
//alternate hash functions:
286+
int fnv1a_hash(const char* cp);
287+
int djb_hash(const char* cp);
286288

287289
//---- generic base GSeg : genomic segment (interval) --
288290
// coordinates are considered 1-based (so 0 is invalid)

gclib/GBitVec.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,10 @@ class GBitVec {
123123
if (value)
124124
clear_unused_bits();
125125
}
126+
unsigned long getMemorySize() const {
127+
unsigned long r = ((unsigned long) Capacity) * sizeof(BitWord);
128+
return r;
129+
}
126130

127131
/// GBitVec copy ctor.
128132
GBitVec(const GBitVec &RHS) : Size(RHS.size()) {

gclib/GHash.hh

Lines changed: 161 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,11 @@
1111
* indexed by a character string (essentially, maps strings to pointers)
1212
*/
1313

14+
//#define HASH_DBG_PRINT 1
15+
16+
#define GSTR_HASH(s) strhash(s)
17+
//#define GSTR_HASH(s) djb_hash(s)
18+
//#define GSTR_HASH(s) fnv1a_hash(s)
1419

1520
template <class OBJ> class GHash {
1621
protected:
@@ -62,17 +67,22 @@ public:
6267
int Capacity() const { return fCapacity; } // table's size, including the empty slots.
6368
void Resize(int m); // Resize the table to the given size.
6469
int Count() const { return fCount; }// the total number of entries in the table.
70+
6571
// Insert a new entry into the table given key and mark.
6672
// If there is already an entry with that key, leave it unchanged,
67-
const OBJ* Add(const char* ky, const OBJ* ptr=NULL, bool mrk=false);
73+
OBJ* Add(const char* ky, OBJ* ptr=NULL, bool mrk=false);
74+
75+
//same with Add, but frees the old element if it's a replacement
76+
OBJ* fAdd(const char* ky, OBJ* ptr=NULL);
77+
6878
//same as Add, but the key pointer is stored directly, no string duplicate
6979
//is made (shared-key-Add)
70-
const OBJ* shkAdd(const char* ky, const OBJ* ptr, bool mrk=false);
80+
OBJ* shkAdd(const char* ky, OBJ* ptr, bool mrk=false);
7181

7282
// Replace data at key, if the entry's mark is less than
7383
// or equal to the given mark. If there was no existing entry,
7484
// a new entry is inserted with the given mark.
75-
OBJ* Replace(const char* ky, const OBJ* ptr, bool mrk=false);
85+
OBJ* Replace(const char* ky, OBJ* ptr, bool mrk=false);
7686
// Remove a given key and its data
7787
OBJ* Remove(const char* ky);
7888
// Find data OBJ* given key.
@@ -197,37 +207,141 @@ template <class OBJ> void GHash<OBJ>::Resize(int m){
197207
}
198208

199209
// add a new entry, or update it if it already exists
200-
template <class OBJ> const OBJ* GHash<OBJ>::Add(const char* ky,
201-
const OBJ* pdata,bool mrk){
210+
211+
212+
template <class OBJ> OBJ* GHash<OBJ>::Add(const char* ky,
213+
OBJ* pdata, bool mrk){
214+
register int p,i,x,h,n;
215+
if(!ky) GError("GHash::insert: NULL key argument.\n");
216+
GASSERT(fCount<fCapacity);
217+
h=GSTR_HASH(ky);
218+
GASSERT(0<=h);
219+
p=HASH1(h,fCapacity);
220+
GASSERT(0<=p && p<fCapacity);
221+
x=HASH2(h,fCapacity);
222+
GASSERT(1<=x && x<fCapacity);
223+
i=-1;
224+
n=fCapacity;
225+
#ifdef HASH_DBG_PRINT
226+
int iterations=0;
227+
int init_p=p;
228+
int init_x=x;
229+
#endif
230+
while(n && hash[p].hash!=-1) {
231+
if ((i==-1)&&(hash[p].hash==-2)) i=p;
232+
if (hash[p].hash==h && strcmp(hash[p].key,ky)==0) {
233+
//replace hash data for this key!
234+
lastkeyptr=hash[p].key;
235+
OBJ* r = (OBJ*) hash[p].data;
236+
hash[p].data = (void*) pdata;
237+
#ifdef HASH_DBG_PRINT
238+
GMessage("Add.R\t%s\t%d,%d,%d\t%d\t%d\t%d\n",
239+
ky, h,init_p,init_x, iterations, fCount, fCapacity);
240+
#endif
241+
//return (OBJ*)hash[p].data;
242+
return r;
243+
}
244+
p=(p+x)%fCapacity;
245+
n--;
246+
}
247+
if(i==-1) i=p;
248+
#ifdef HASH_DBG_PRINT
249+
GMessage("Add.N\t%s\t%d,%d,%d\t%d\t%d\t%d\n",
250+
ky, h,init_p,init_x, iterations, fCount, fCapacity);
251+
#endif
252+
GTRACE(("GHash::insert: key=\"%s\"\n",ky));
253+
//GMessage("GHash::insert: key=\"%s\"\n",ky);
254+
GASSERT(0<=i && i<fCapacity);
255+
GASSERT(hash[i].hash<0);
256+
hash[i].hash=h;
257+
hash[i].mark=mrk;
258+
hash[i].key=Gstrdup(ky);
259+
hash[i].keyalloc=true;
260+
lastkeyptr=hash[i].key;
261+
hash[i].data= (void*) pdata;
262+
fCount++;
263+
if((100*fCount)>=(MAX_LOAD*fCapacity)) Resize(fCount);
264+
GASSERT(fCount<fCapacity);
265+
return pdata;
266+
}
267+
268+
/*
269+
int p,i,x,h;
270+
if(!ky) GError("GHash::insert: NULL key argument.\n");
271+
GASSERT(fCount<fCapacity);
272+
h=GSTR_HASH(ky);
273+
GASSERT(0<=h);
274+
p=HASH1(h,fCapacity);
275+
GASSERT(0<=p && p<fCapacity);
276+
x=HASH2(h,fCapacity);
277+
GASSERT(1<=x && x<fCapacity);
278+
if (checkReplace(ky, pdata, p, i, h, x)) {
279+
return (OBJ*)hash[p].data;
280+
}
281+
GTRACE(("GHash::insert: key=\"%s\"\n",ky));
282+
//GMessage("GHash::insert: key=\"%s\"\n",ky);
283+
GASSERT(0<=i && i<fCapacity);
284+
GASSERT(hash[i].hash<0);
285+
hash[i].hash=h;
286+
hash[i].mark=mrk;
287+
hash[i].key=Gstrdup(ky);
288+
hash[i].keyalloc=true;
289+
lastkeyptr=hash[i].key;
290+
hash[i].data= (void*) pdata;
291+
fCount++;
292+
if((100*fCount)>=(MAX_LOAD*fCapacity)) Resize(fCount);
293+
GASSERT(fCount<fCapacity);
294+
return pdata;
295+
}
296+
*/
297+
template <class OBJ> OBJ* GHash<OBJ>::fAdd(const char* ky,
298+
OBJ* pdata){
202299
register int p,i,x,h,n;
203300
if(!ky) GError("GHash::insert: NULL key argument.\n");
204301
GASSERT(fCount<fCapacity);
205-
h=strhash(ky);
302+
h=GSTR_HASH(ky);
206303
GASSERT(0<=h);
207304
p=HASH1(h,fCapacity);
208305
GASSERT(0<=p && p<fCapacity);
209306
x=HASH2(h,fCapacity);
210307
GASSERT(1<=x && x<fCapacity);
211308
i=-1;
212309
n=fCapacity;
213-
while(n && hash[p].hash!=-1){
310+
#ifdef HASH_DBG_PRINT
311+
int iterations=0;
312+
int init_p=p;
313+
int init_x=x;
314+
#endif
315+
while(n && hash[p].hash!=-1) {
214316
if ((i==-1)&&(hash[p].hash==-2)) i=p;
215317
if (hash[p].hash==h && strcmp(hash[p].key,ky)==0) {
216318
//replace hash data for this key!
217319
lastkeyptr=hash[p].key;
320+
if (FREEDATA) (*fFreeProc)(hash[p].data);
218321
hash[p].data = (void*) pdata;
219-
return (OBJ*)hash[p].data;
322+
#ifdef HASH_DBG_PRINT
323+
GMessage("Add.R\t%s\t%d,%d,%d\t%d\t%d\t%d\n",
324+
ky, h,init_p,init_x, iterations, fCount, fCapacity);
325+
#endif
326+
return pdata;
220327
}
221328
p=(p+x)%fCapacity;
329+
#ifdef HASH_DBG_PRINT
330+
++iterations;
331+
#endif
222332
n--;
223333
}
224334
if(i==-1) i=p;
335+
#ifdef HASH_DBG_PRINT
336+
GMessage("Add.N\t%s\t%d,%d,%d\t%d\t%d\t%d\n",
337+
ky, h,init_p,init_x, iterations, fCount, fCapacity);
338+
#endif
225339
GTRACE(("GHash::insert: key=\"%s\"\n",ky));
226340
//GMessage("GHash::insert: key=\"%s\"\n",ky);
227341
GASSERT(0<=i && i<fCapacity);
228342
GASSERT(hash[i].hash<0);
229343
hash[i].hash=h;
230-
hash[i].mark=mrk;
344+
hash[i].mark=false;
231345
hash[i].key=Gstrdup(ky);
232346
hash[i].keyalloc=true;
233347
lastkeyptr=hash[i].key;
@@ -238,12 +352,14 @@ template <class OBJ> const OBJ* GHash<OBJ>::Add(const char* ky,
238352
return pdata;
239353
}
240354

241-
template <class OBJ> const OBJ* GHash<OBJ>::shkAdd(const char* ky,
242-
const OBJ* pdata,bool mrk){
355+
356+
357+
template <class OBJ> OBJ* GHash<OBJ>::shkAdd(const char* ky,
358+
OBJ* pdata,bool mrk){
243359
register int p,i,x,h,n;
244360
if(!ky) GError("GHash::insert: NULL key argument.\n");
245361
GASSERT(fCount<fCapacity);
246-
h=strhash(ky);
362+
h=GSTR_HASH(ky);
247363
GASSERT(0<=h);
248364
p=HASH1(h,fCapacity);
249365
GASSERT(0<=p && p<fCapacity);
@@ -281,11 +397,11 @@ template <class OBJ> const OBJ* GHash<OBJ>::shkAdd(const char* ky,
281397

282398

283399
// Add or replace entry
284-
template <class OBJ> OBJ* GHash<OBJ>::Replace(const char* ky,const OBJ* pdata, bool mrk){
400+
template <class OBJ> OBJ* GHash<OBJ>::Replace(const char* ky, OBJ* pdata, bool mrk){
285401
register int p,i,x,h,n;
286402
if(!ky){ GError("GHash::replace: NULL key argument.\n"); }
287403
GASSERT(fCount<fCapacity);
288-
h=strhash(ky);
404+
h=GSTR_HASH(ky);
289405
GASSERT(0<=h);
290406
p=HASH1(h,fCapacity);
291407
GASSERT(0<=p && p<fCapacity);
@@ -328,7 +444,7 @@ template <class OBJ> OBJ* GHash<OBJ>::Remove(const char* ky){
328444
if(!ky){ GError("GHash::remove: NULL key argument.\n"); }
329445
OBJ* removed=NULL;
330446
if(0<fCount){
331-
h=strhash(ky);
447+
h=GSTR_HASH(ky);
332448
GASSERT(0<=h);
333449
p=HASH1(h,fCapacity);
334450
GASSERT(0<=p && p<fCapacity);
@@ -364,7 +480,7 @@ template <class OBJ> bool GHash<OBJ>::hasKey(const char* ky) {
364480
register int p,x,h,n;
365481
if(!ky){ GError("GHash::find: NULL key argument.\n"); }
366482
if(0<fCount){
367-
h=strhash(ky);
483+
h=GSTR_HASH(ky);
368484
GASSERT(0<=h);
369485
p=HASH1(h,fCapacity);
370486
GASSERT(0<=p && p<fCapacity);
@@ -383,30 +499,45 @@ template <class OBJ> bool GHash<OBJ>::hasKey(const char* ky) {
383499
return false;
384500
}
385501

502+
386503
template <class OBJ> OBJ* GHash<OBJ>::Find(const char* ky, char** keyptr){
387504
register int p,x,h,n;
388505
if(!ky){ GError("GHash::find: NULL key argument.\n"); }
389-
if(0<fCount){
390-
h=strhash(ky);
391-
GASSERT(0<=h);
392-
p=HASH1(h,fCapacity);
393-
GASSERT(0<=p && p<fCapacity);
394-
x=HASH2(h,fCapacity);
395-
GASSERT(1<=x && x<fCapacity);
396-
GASSERT(fCount<fCapacity);
397-
n=fCapacity;
398-
while(n && hash[p].hash!=-1){
506+
if (fCount==0) return NULL;
507+
h=GSTR_HASH(ky);
508+
GASSERT(0<=h);
509+
p=HASH1(h,fCapacity);
510+
GASSERT(0<=p && p<fCapacity);
511+
x=HASH2(h,fCapacity);
512+
GASSERT(1<=x && x<fCapacity);
513+
GASSERT(fCount<fCapacity);
514+
n=fCapacity;
515+
#ifdef HASH_DBG_PRINT
516+
int iterations=0;
517+
int init_p=p;
518+
int init_x=x;
519+
#endif
520+
while(n && hash[p].hash!=-1){
399521
if(hash[p].hash==h && strcmp(hash[p].key,ky)==0){
400522
if (keyptr!=NULL) *keyptr = hash[p].key;
523+
#ifdef HASH_DBG_PRINT
524+
GMessage("Found \t%s\t%d,%d,%d\t%d\t%d\t%d\n",
525+
ky, h,init_p,init_x, iterations, fCount, fCapacity);
526+
#endif
401527
return (OBJ*)hash[p].data;
402528
}
403529
p=(p+x)%fCapacity;
404530
n--;
405-
}
406-
}
407-
return NULL;
531+
#ifdef HASH_DBG_PRINT
532+
++iterations;
533+
#endif
408534
}
409-
535+
#ifdef HASH_DBG_PRINT
536+
GMessage("Nfound\t%s\t%d,%d,%d\t%d\t%d\t%d\n",
537+
ky, h,init_p,init_x, iterations, fCount, fCapacity);
538+
#endif
539+
return NULL;
540+
}
410541

411542
template <class OBJ> void GHash<OBJ>::startIterate() {// initialize a key iterator; call
412543
fCurrentEntry=0;

0 commit comments

Comments
 (0)