[Clamav-devel] clamav-devel Digest, Vol 111, Issue 12 AC algorithm optimization

liu31380 at gmail.com liu31380 at gmail.com
Fri May 2 00:05:08 EDT 2014


Sorry for late reply.
My implementation as follow:
I modified the original algorithm, when cur state is root, using the 2-gram hash table to accelerate filtering.
In following implementation, building dat table dependent on the Trie structure that had been builded.

References:
http://ieeexplore.ieee.org/xpl/login.jsp?tp=&arnumber=31365&url=http%3A%2F%2Fieeexplore.ieee.org%2Fxpls%2Fabs_all.jsp%3Farnumber%3D31365

---------------------------------------
#include <stdio.h>

#include <stdlib.h>

#include "common.h"

#include "my_dat_ac.h"



#define T_SIZE 655360

#define printf(fmt,arg...)







struct input_list {

	unsigned char node;

	struct bfs_list *next;

};



enum {

	T_MUL_ST = 1,   // multi-state

	T_SIN_ST,   // sigle-state

	T_TER_ST,   // terminal-state

	T_SEP_ST	// separate-state

};



struct _state_node {

	struct my_ac_patt * list;

	uint32_t fail;

};



struct _state_build {

	struct input_list *valid_input_bfs;

	struct input_list *valid_input_bfs_last;

	uint16_t type;

	uint8_t from_char;

	struct _state_node state;

};



struct _state_build build_new[T_SIZE] = {};

static uint32_t new[T_SIZE] = {};          



static uint32_t base[T_SIZE] = {};

static uint32_t check[T_SIZE] = {};

static struct _state_node check_match[T_SIZE] = {};



static uint32_t old[T_SIZE] = {};       



static uint32_t dat_hash[65536] = {};       



static int my_input_enqueue(struct input_list **bfs, struct input_list **last, unsigned char n)

{

	struct input_list *new;



	new = (struct input_list *) malloc(sizeof(struct input_list));

	if(!new) {

		cli_errmsg("bfs_enqueue: Can't allocate memory for input_list\n");

		return CL_EMEM;

	}

	new->next = NULL;

	new->node = n;



	if(*last) {

		(*last)->next = new;

		*last = new;

	} else {

		*bfs = *last = new;

	}



	return CL_SUCCESS;

}



static int my_input_dequeue(struct input_list **bfs, struct input_list **last)

{

	struct input_list *lpt;

	int pt;



	if(!(lpt = *bfs)) {

		return -1;

	} else {

		*bfs = (*bfs)->next;

		pt = lpt->node;

		if(lpt == *last)

			*last = NULL;

		free(lpt);

		return pt;

	}

}



static int my_input_queue_foreach(struct input_list **bfs)

{

	struct input_list *lpt;

	int pt;



	if(!(lpt = *bfs)) {

		return -1;

	} else {

		*bfs = (*bfs)->next;

		pt = lpt->node;

		return pt;

	}

}



extern struct bitmap_state *get_bit_popcnt_next_state(struct bitmap_state *pcur_state, unsigned char a);

static int max_check_index = 0;

int my_dat_ac_build(struct my_matcher *root)

{

	int count1 =0;

	struct bfs_list *multi_bfs = NULL, *multi_bfs_last = NULL;

	struct bfs_list *sep_bfs = NULL, *sep_bfs_last = NULL;

	int res;

	uint8_t a;

	struct input_list *tmp;

	//init value------s : = 1 ; new(s) : = 1 ; queue := { s } ;

	int i;

	for(i=0;i<T_SIZE;i++)

		base[i] = 2;

	struct bitmap_state *bitmap_st = root->ac_root;

	struct bitmap_state *bitmap_st1;



	uint32_t s = bitmap_st->state_id , s1;

	new[s] = bitmap_st->state_id;

	old[bitmap_st->state_id] = s;

	//base[new[s]] = bitmap_st->state_id;

	my_bfs_enqueue(&multi_bfs, &multi_bfs_last, (void*)bitmap_st);

	/* Routine for multistates */

	while((bitmap_st = my_bfs_dequeue(&multi_bfs, &multi_bfs_last)) != (void*)queue_NULL) {

		s = (uint32_t)bitmap_st->state_id;

		printf("-----M-----%03d(%03d)----------\n", s, new[s]);

		/* Let s be the next state in queue */

abcd:

		tmp = build_new[s].valid_input_bfs;

		while((res = my_input_queue_foreach(&tmp)) != -1) {

			a = (uint8_t)res;

			if(check[base[new[s]] + a] != 0){

				base[new[s]] = base[new[s]] + 1;

				goto abcd;

			}

		}



		while((res = my_input_dequeue(&build_new[s].valid_input_bfs, &build_new[s].valid_input_bfs_last)) != -1) {

			/*for each a in list[s]*/

			a = (uint8_t)res;

//			while(check[base[new[s]] + a] != 0){

//				base[new[s]] = base[new[s]] + 1;

//			}

			check[base[new[s]] + a] = new[s];

			bitmap_st1 = get_bit_popcnt_next_state(bitmap_st, a);

			s1 = bitmap_st1->state_id;

			new[s1] = base[new[s]] + a;

			old[base[new[s]] + a] = s1;

			if(new[s1] > max_check_index){

				max_check_index = new[s1];

			}

			printf("o_stat %03d -%02x- new_stat %03d;;;;%d\n", s1, a, new[s1], ++count1);



			//build hash

			if(bitmap_st1->level == 2){

				uint8_t hash_str[2];

				hash_str[0] = build_new[s].from_char;

				hash_str[1] = build_new[s1].from_char;

				uint16_t hash_val = *(uint16_t*)hash_str;

				dat_hash[hash_val] = new[s1];

			}



//			check_match[new[s1]].fail = new[build_new[s1].state.fail];

//			check_match[new[s1]].list = build_new[s1].state.list;



			if(build_new[s1].type != T_SEP_ST){

				my_bfs_enqueue(&multi_bfs, &multi_bfs_last, (void*)bitmap_st1);

			}

			else {

				my_bfs_enqueue(&sep_bfs, &sep_bfs_last, (void*)bitmap_st1);

			}

		}

	}



	/* Routine for single state */

	while((bitmap_st = my_bfs_dequeue(&sep_bfs, &sep_bfs_last)) != queue_NULL) {

		s = (uint32_t)bitmap_st->state_id;

		printf("-----S-----%03d(%03d)----------\n", s, new[s]);

		while(build_new[s].type != T_TER_ST){

			a = (uint8_t)my_input_dequeue(&build_new[s].valid_input_bfs, &build_new[s].valid_input_bfs_last);

			while(check[base[new[s]] + a] != 0){

				base[new[s]] = base[new[s]] + 1;

			}

			check[base[new[s]] + a] = new[s];

			bitmap_st1 = get_bit_popcnt_next_state(bitmap_st, a);

			s1 = bitmap_st1->state_id;

			new[s1] = base[new[s]] + a;

			old[base[new[s]] + a] = s1;

			if(new[s1] > max_check_index){

				max_check_index = new[s1];

			}



			//build hash

			if(bitmap_st1->level == 2){

				uint8_t hash_str[2];

				hash_str[0] = build_new[s].from_char;

				hash_str[1] = build_new[s1].from_char;

				uint16_t hash_val = *(uint16_t*)hash_str;

				dat_hash[hash_val] = new[s1];

			}



			printf("o_stat %03d -%02x- new_stat %03d;;;;%d\n", s1, a, new[s1], ++count1);

//			check_match[new[s1]].fail = new[build_new[s1].state.fail];

//			check_match[new[s1]].list = build_new[s1].state.list;

			s = s1;

			bitmap_st = bitmap_st1;

		}

	}

	printf("max_check_index %d\n", max_check_index);

	for(i = 0;i<=max_check_index;i++)

	{

		check_match[i].fail = new[build_new[old[i]].state.fail];

		check_match[i].list = build_new[old[i]].state.list;

	}



	return 0;

}

static unsigned int fail_count[32] = {};

uint32_t my_dat_ac_fail_proc(unsigned char a, uint32_t s)

{

#if 0

//	return 1;

//	uint32_t t;

//	s = check_match[s].fail;

//	t = a + base[s];

//	if(check[t] != s)

//		return 1;

//	else

//		return t;



	static int tdfsdfasdf = 1;

	tdfsdfasdf++;

	return 1;

#else

	uint32_t t;

	int i = 0;

	do{

		fail_count[i++]++;

		if(s == 1) return s;

		s = check_match[s].fail;

		t = a + base[s];

	}while(check[t] != s);

	return t;

#endif

}



static uint32_t st_hash_fail_count = 0;

static uint32_t st_hash_ref_count = 0;

uint32_t my_dat_ac_fail_proc_hash(uint32_t s, uint8_t *buffer, uint32_t *i, uint32_t length)

{

	uint32_t t;

	int k = 0;

	uint32_t j;

	do{

		fail_count[k++]++;

		if(s == 1){//root

loop:

			j = *i;

			//hash search

			for(;j<length-1;j++)

			{

				st_hash_ref_count++;

				if(dat_hash[*(uint16_t*)(buffer+j)]){

					*i = j+1;

					return dat_hash[*(uint16_t*)(buffer+j)];

				}

				st_hash_fail_count++;

			}

			*i = j+1;

			return 0;

		}

		s = check_match[s].fail;

		if(s == 1)

			goto loop;

		t = buffer[*i] + base[s];

	}while(check[t] != s);

	return t;

}



int my_dat_ac_scanbuff(const unsigned char *buffer, uint32_t length,const struct my_matcher *root)

{

	int i;

//	for(i = 1;i< root->ac_nodes+1;i++)

//	{

//		//printf("old s %03d 's type is %d\n", i, build_new[i].type);

//		printf("old s %03d  new_stat is %d\n", i, new[i]);

//	}

//	for(i = 0;i< T_SIZE;i++)

//	{

//		if(check_match[i].fail == 0)

//		printf("stat %05d fail null[%d]\n", old[i], i);

//	}

//	fprintf(stdout, "trans num: %d\n", my_ac_trans_sum(root));

	//return 0;

	memset((void*)fail_count, 0x00, sizeof(fail_count));

	uint32_t s = 1, t;

	printf("my_dat_ac_scanbuff\n", i, build_new[i].type);

	int count = 0;

	for (i = 0; i < length; i++) {

		t = buffer[i] + base[s];

		if(check[t] != s){

			//fail

			//t = my_dat_ac_fail_proc(buffer[i], s);

			t = my_dat_ac_fail_proc_hash(s, buffer, &i, length);

			//continue;

		}

//		if(s == 1)

//			st_hash_ref_count++;

		printf("%05d  --%02x-->  %05d\n", old[s], buffer[i], old[t]);

		s=t;

		if(check_match[s].list){

			//final

			//fprintf(stdout, "hit:sigid %d:pos %d:\n", check_match[s].list->sigid, i);

			count++;

		}

	}

//	fprintf(stdout, "hit count %d, st_hash_fail_count %u st_hash_ref_count %u\n", count, st_hash_fail_count, st_hash_ref_count);

//	for(i = 0;i<8;i++)

//		fprintf(stdout, "%u,", fail_count[i]);

//	fprintf(stdout, "\n");

	return s;

}



int my_dat_update_input(uint32_t s, unsigned char a, uint32_t t)

{

	printf("id:%03d -- %02x -- id:%03d\n", s, a, t);

	my_input_enqueue(&build_new[s].valid_input_bfs, &build_new[s].valid_input_bfs_last, a);

	build_new[t].from_char = a;

	return 0;

}



int my_dat_update_patlist(uint32_t s, struct my_ac_patt *list)

{

	build_new[s].state.list = list;

	return 0;

}



int my_dat_update_fail(uint32_t s,uint32_t fail)

{

	build_new[s].state.fail = fail;

	return 0;

}



int my_dat_update_terminal(uint32_t s)

{

	build_new[s].type = T_TER_ST;

	return 0;

}



int my_dat_update_type(struct bitmap_state **st_array, uint16_t array_len)

{

	int i = array_len-1;



	if(build_new[st_array[i]->state_id].type !=0)

		return 0;



	int flag = 0;

	for(; i>=0;i--)

	{

		if(st_array[i]->sumary[_bitmap_group] > 1 || flag)

		{

			build_new[st_array[i]->state_id].type = T_MUL_ST;

			flag = 1;

		}

		else

			build_new[st_array[i]->state_id].type = T_SEP_ST;

	}

	return 0;

}







More information about the clamav-devel mailing list