// bloom filter
// gcc bloom.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
// the constants you can play with
#define SIZE 100 // total bits used in filter these will be divided between rows
#define MAXROWS 100 // we will test permutations up to this number of rows
#define POPULATION_COUNT 10 // number of hits stored
#define TESTS 10000 // number of random items to test
#define STRING_LENGTH 10 // string length of the items we randomly generate
//
// For our simulation to work we need a reasonable proficient hashing
// algorithm that will produce a randomly different result for
// each row.
//
unsigned long hash(const char *str, unsigned long hash)
{
unsigned int i = 0;
int length = strlen(str);
for (i = 0; i < length; ++str, ++i)
{
hash ^= ((hash << 5) + (*str) + (hash >> 2));
}
return hash;
}
// hashing algorithms tend to work better base on primes
// Exercise: Does it really matter?
static int IsPrime(int number)
{
for (int i = 2; i < number; i++)
{
if (number % i == 0 && i != number)
return 0;
}
return 1;
}
//
// create test/hit item
//
char *random_string()
{
char *s = calloc(STRING_LENGTH + 1, 1);
for (int i = 0; i < STRING_LENGTH; i++)
{
s[i] = 'a' + rand() % 26; // may as well be printable
}
return s;
}
int main(int argc, char *argv[])
{
unsigned long seeds[MAXROWS];
srand(time(0));
// create unique random seeds for our hashing algorithm
for (int i = 0; i < MAXROWS; i++)
{
int dup;
do
{
dup = 0;
seeds[i] = rand();
if (!IsPrime(seeds[i])) // Only use primes?
continue;
// very unlikely we would duplicate, but lets check anyway
for (int j = 0; j < i; j++)
{
if (seeds[i] == seeds[j])
{
dup = 1;
break;
}
}
} while (dup);
}
printf("Population Density %d in %d bits %f%%\n", POPULATION_COUNT, SIZE, (float)100 * POPULATION_COUNT / (float)SIZE);
//
// Our bitmap size is SIZE, this will be split over one bitmap, then two bitmaps
// all the way down to MAXROWS, you can take it all the way so a bitmap has only
// bit which should show all false postitives
//
for (int tr = 1; tr <= MAXROWS; tr++)
{
if (SIZE % tr != 0)
{
continue; // only check for bitmaps that will use dead on SIZE bits overall
}
int clashes = 0;
unsigned char **bitmaps; // hey I'm using chars for bits, this is example code not a real bloom filter
// allocate the bitmap arrays
bitmaps = malloc(tr * sizeof(char *));
int mapsize = SIZE / tr;
for (int r = 0; r < tr; r++)
{
bitmaps[r] = calloc(mapsize, 1);
}
//
// populate with positives
//
for (int i = 0; i < POPULATION_COUNT; i++)
{
char *positive = random_string();
//printf("%s\n",positive);
// created populate each row
for (int r = 0; r < tr; r++)
{
unsigned long offset = hash(positive, seeds[r]) % mapsize;
if (bitmaps[r][offset] == 1)
clashes++;
bitmaps[r][offset] = 1;
}
free(positive);
}
// do the bitmaps look sensible. e.g. no patterns
/*for (int r = 0; r < tr; r++)
{
for (int i = 0; i<mapsize;i++)
{
printf("%c",bitmaps[r][i]==0?'0':'1');
}
printf("\n");
}*/
int false_matches = 0;
// generate test cases
for (int i = 0; i < TESTS; i++)
{
//char out[2000];
char *sample = random_string();
int alert = 1; // start off thinking it may be positive
// created populate each row
//out[0] = '\0';
for (int r = 0; r < tr; r++)
{
unsigned long offset = hash(sample, seeds[r]) % mapsize;
//printf(" %ld ",offset);
// sprintf(&out[strlen(out)],"(%ld %ld %ld)",offset,hash(sample, seeds[r]),seeds[r]);
//sprintf(&out[strlen(out)],"(%ld)",offset);
if (bitmaps[r][offset] == 0)
{
alert = 0;
break;
}
}
//if (alert)printf("%s %s\n",sample,out);
false_matches += alert;
free(sample);
}
printf("Rows %d map size %d false positives %d Percent %f%% clashes %d\n", tr, mapsize, false_matches, (float)100 * false_matches / (float)TESTS, clashes);
// free the bitmap arrays
for (int r = 0; r < tr; r++)
{
free(bitmaps[r]);
}
free(bitmaps);
}
}
Comments
Post a Comment