ITPub博客

首页 > 大数据 > 可视化 > 大数据处理算法之一 bloom filter 算法

大数据处理算法之一 bloom filter 算法

可视化 作者:wwo123456 时间:2013-10-30 16:38:41 0 删除 编辑
#include
#include
#include
using std::cout;
using std::endl;

inline static unsigned int GETBIT(char* vector, unsigned int pos)
{
    unsigned int tmp = vector[pos / 8] & 1<<(7 - pos%8);
    return tmp;
}
inline static void SETBIT(char* vector, unsigned int pos)
{
    vector[pos / 8] |= 1<<(7 - pos%8);
}

static unsigned int get_lines(const char* str)
{
    FILE* fp;
    if( (fp = fopen(str,"r")) == NULL)
    {
        cout << "read file error ";
        exit(1);
    }
    unsigned int num(0);
    char buf[256];
    while( fgets(buf, sizeof(buf), fp) )
    {
        ++ num;
        if(feof(fp))
            break;
    }
    fclose(fp);
    return num;
}

inline static double get_bit(const unsigned int& n, const float fasle_positive)
{
    return n * log(fasle_positive) / log(0.6185);
}

unsigned int FHash(char* str, unsigned int len)
{
    unsigned int hash = 0;
    unsigned int a = 6666;

    for(register int i(0); i
        hash = hash * a + (int)str[i];
    hash *= a;
    return hash;
}

static unsigned int length(char* str)
{
    char *p = str;
    int num = 0;
    while(*p++ != '')
        ++ num;
    return num;
}

int main(int argc, char* argv[])
{
    static const float fasle_positive = 0.01;
    //const char* file = "/ifs2/user/chr.txt";
    const char* file = "/ifs2/user/ceshi.txt";

    unsigned int lines = get_lines(file);
    static int mBit = (int)get_bit(lines, fasle_positive);
    static int kHash = (int)(log(fasle_positive) * -1/log(2));

    fprintf(stdout,"mBit %d kHash %d ", mBit, kHash);

    unsigned int size = mBit >> 2;
    char *cp = new char[size];
    for(register int i(0); i
        cp[i] = 0;

    FILE* fp;
    if( (fp = fopen(file, "rb")) == NULL)
    {
        cout << "read file error ";
        exit(1);
    }
    char line[256];
    unsigned int not_repeated(0);
    while(! feof(fp))
    {
        fgets(line, sizeof(line), fp);
        unsigned int num(0);
        int len = length(line);
        if( GETBIT(cp, AHash(line, len)% mBit)) // 1
            ++ num;
        else
            SETBIT(cp, AHash(line, len)% mBit); // set to 1

        if( GETBIT(cp, BHash(line, len)% mBit)) // 1
            ++ num;
        else
            SETBIT(cp, BHash(line, len)% mBit); // set to 1

        if( GETBIT(cp, CHash(line, len)% mBit)) // 1
            ++ num;
        else
            SETBIT(cp, CHash(line, len)% mBit); // set to 1

        if( GETBIT(cp, DHash(line, len)% mBit)) // 1
            ++ num;
        else
            SETBIT(cp, DHash(line, len)% mBit); // set to 1

        if( GETBIT(cp, EHash(line, len)% mBit)) // 1
            ++ num;
        else
            SETBIT(cp, EHash(line, len)% mBit); // set to 1

        if( GETBIT(cp, FHash(line, len)% mBit)) // 1
            ++ num;
        else
            SETBIT(cp, FHash(line, len)% mBit); // set to 1

        if(num < 6)
            ++ not_repeated;
        else
            cout << "repeated: " << line;
    }
    fclose(fp);
    delete [] cp;

    fprintf(stdout,"There are %d lines ", lines);
    fprintf(stdout,"not repeated lines %d ", not_repeated);
    fprintf(stdout,"repeated lines %d ", lines - not_repeated);
    return 0;
}
<!-- 正文结束 -->

来自 “ ITPUB博客 ” ,链接:http://blog.itpub.net/21525152/viewspace-1120887/,如需转载,请注明出处,否则将追究法律责任。

上一篇: 没有了~
下一篇: 没有了~
请登录后发表评论 登录
全部评论

注册时间:2009-04-30