/* pairmining.c
* 
* Copyright (C) 2005 2006 Toon Calders, Bart Goethals, Szymon Jaroszewicz
* 
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or (at
* your option) any later version.
* 
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
* General Public License for more details.
* 
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>
#include <float.h>
#include <math.h>

#ifndef NAN
#define NAN (0.0/0.0)
//#define NAN sqrt(-1) //for windows!
#endif


#include "dataset.h"
#include "kdtree.h"
#include "range_tree.h"
#include "htable.h"
#include "eclat_items.h"

#define MAXLINE 10000


/////////////////////////////////////////////////
// Set the counting data structure/method here
/////////////////////////////////////////////////


//#define USE_BRUTE_FORCE
//#define USE_KDTREE
#define USE_RANGE_TREE
//#define USE_EXPLICIT_PAIRS		/* create all pairs explicitly, then use eclat */


/////////////////////////////////////////////////
// Set the support measure to use here
/////////////////////////////////////////////////

//#define SUPP_RHO
//#define SUPP_F
#define SUPP_TAU


/* Shared line buffer; main() reads each input line into it with fgets(). */
char linebuf[MAXLINE];
/* Number of rows of the full input dataset; set once in main() (N = d->n). */
size_t N;
//dataset * orig_data;
/* Rank-converted clones of the whole dataset, assigned in main():
 * lrank is produced with TIES_MIN, hrank with TIES_MAX. */
dataset * whole_data_lrank;
dataset * whole_data_hrank;

//new
/* count: itemsets printed as frequent; pruned: candidates rejected by the
 * upper bound in my_eclat().  Both are reset in main() before mining. */
size_t count, pruned;
/* Fixed-size (1024 bytes) buffer type for one item name. */
typedef char M[1024];
/* Table of item names indexed by attribute number; allocated by
 * my_create_mapping(), stays NULL when no mapping file is given. */
M * mapping=NULL;
/* 1 when a mapping file was supplied on the command line, 0 otherwise. */
size_t map;

//new
/*
 * Debug helper: print `num_items` on its own line, then one
 * "index => name" line for each of the first `num_items` entries of the
 * global `mapping` table.
 *
 * num_items: number of entries to print; values <= 0 print only the
 *            count line.
 *
 * Nothing but the count line is printed when no mapping has been loaded.
 */
void my_print_mapping(int num_items)
{
  /* int index: matches the "%d" conversion and makes a negative
     num_items terminate the loop immediately instead of wrapping. */
  int i;

  printf("%d\n",num_items);
  if (mapping == NULL)
      return;

  for (i=0;i<num_items;i++)
  {
      printf("%d => %s\n", i, mapping[i]);
  }
}

//new
/*
 * Load the item-name table from the text file `fn` into the global
 * `mapping` array.
 *
 * Each line is expected to look like "<index> <name>" (space separated).
 * The name is stored in mapping[index].
 *
 * fn:        path of the mapping file.
 * num_items: number of slots to allocate; lines whose index is not in
 *            [0, num_items) are skipped with a warning.
 *
 * Slots that never appear in the file are left as empty strings.  Names
 * longer than a slot (sizeof(M) - 1 characters) are truncated.  Blank or
 * malformed lines are skipped.  The process exits with status 1 when the
 * file cannot be opened or memory cannot be allocated.
 */
void my_create_mapping(char *fn, size_t num_items)
{
    char linebuf[MAXLINE];
    FILE *f = fopen(fn, "r");

    if(f == NULL)
    {
        fprintf(stderr, "error opening mapping file\n");
        exit(1);
    }

    /* calloc: zero-filled slots read back as "" instead of garbage. */
    mapping = (M *) calloc(num_items > 0 ? num_items : 1, sizeof(M));
    if(mapping == NULL)
    {
        fprintf(stderr, "out of memory while creating mapping\n");
        fclose(f);
        exit(1);
    }

    while(fgets(linebuf, MAXLINE, f) != NULL)
    {
        char *token1, *token2, *end;
        unsigned long idx;

        /* Strip the trailing newline only if there is one (the last line
           of a file may not end with '\n'). */
        linebuf[strcspn(linebuf, "\n")] = '\0';

        token1 = strtok(linebuf, " ");
        token2 = strtok(NULL, " ");
        if(token1 == NULL || token2 == NULL)
            continue;               /* blank or incomplete line */

        idx = strtoul(token1, &end, 10);
        if(end == token1 || idx >= num_items)
        {
            fprintf(stderr, "ignoring mapping entry with bad index: %s\n", token1);
            continue;
        }

        /* Bounded copy: always NUL-terminated, never overruns the slot. */
        snprintf(mapping[idx], sizeof(M), "%s", token2);
    }

    fclose(f);
}


/*
 * Count the fields of a delimited line.
 *
 * Fields are separated by ',' or ';' (consecutive delimiters are treated
 * as one, as strtok does).  A field whose first character is a letter is
 * counted as categorical.
 *
 * line:          NUL-terminated input line; it is NOT modified (the
 *                tokenising is done on a private copy).
 * n_categ_items: out parameter, receives the number of fields that start
 *                with a letter.
 *
 * Returns the total number of fields (categorical ones included).
 * Exits with status 1 if the working copy cannot be allocated.
 */
size_t
count_items(char * line, size_t * n_categ_items)
{
    size_t len = strlen(line);
    char * lcopy = (char *) malloc(len + 1);
    size_t n = 0;
    char * field;

    *n_categ_items = 0;

    if(lcopy == NULL)
    {
        fprintf(stderr, "out of memory while counting items\n");
        exit(1);
    }
    memcpy(lcopy, line, len + 1);

    for(field = strtok(lcopy, ",;"); field != NULL; field = strtok(NULL, ",;"))
    {
        /* <ctype.h> functions require an unsigned char value. */
        if(isalpha((unsigned char) field[0]))
        {
            (*n_categ_items)++;
        }
        n++;
    }

    free(lcopy);    /* the copy was previously leaked on every call */
    return n;
}

/*
 * Debug helper: write every row of `d` to stdout as one comma-separated
 * line of "%f" formatted values.
 *
 * The first column is always printed, so the dataset is expected to have
 * at least one item per row.
 */
void
print_data(dataset * d)
{
    size_t row = 0;

    while(row < d->n)
    {
        size_t col;

        /* first value without a separator, the rest prefixed by ',' */
        printf("%f", d->data[row][0]);
        for(col = 1; col < d->n_items; col++)
            printf(",%f", d->data[row][col]);
        printf("\n");

        row++;
    }
}

//new
/*
 * Return the item paired with `item`: an even item maps to the odd item
 * just below it (item - 1) and an odd item maps to the even item just
 * above it (item + 1).  The arithmetic is unsigned, so item 0 wraps
 * around to the largest size_t value.
 */
size_t get_opposite_item(size_t item)
{
	return (item & 1u) ? item + 1 : item - 1;
}

//new
/*
 * Map an item id to its attribute number: even items give item / 2 and
 * odd items give (item + 1) / 2, so the odd item 2a-1 and the even item
 * 2a both map to attribute a.
 */
size_t get_num_attr(size_t item)
{
	/* adding the low bit rounds an odd id up to the next even id */
	size_t rounded = item + (item & 1u);

	return rounded / 2;
}
//new
/*
 * Second upper bound on the support of the itemset `root`.
 *
 * Builds a lookup key [lenroot, root[0], ..., root[lenroot-1]] and walks
 * it from the last item down to the second one.  Every odd item met on
 * the way is replaced by get_opposite_item() of itself (the replacement
 * is kept for the following lookups), the resulting key is looked up in
 * `freq`, and the stored support, if any, is accumulated.
 *
 * root:    items of the candidate itemset.
 * lenroot: number of items in `root`; must be below 5000 (size of the
 *          static key buffer).
 * freq:    hashtable mapping keys of that form to size_t supports.
 *
 * Returns N*(N-1)/2 minus the accumulated supports (computed in size_t,
 * then converted to double).
 */
double my_compute_upper_bound2(size_t *root, size_t lenroot, htable * freq)
{
   static size_t key[5000];
   size_t found_total = 0;
   size_t pos;

   key[0] = lenroot;
   for (pos = 1; pos <= lenroot; pos++)
	key[pos] = root[pos - 1];

   /* the first item (key[1]) is deliberately never flipped */
   for (pos = lenroot; pos > 1; pos--)
   {
	size_t *hit;

	if (key[pos] % 2 == 0)   /* even items are left untouched */
		continue;

	key[pos] = get_opposite_item(key[pos]);

	hit = (size_t *) htable_get(freq, key);
	if (hit != NULL)
		found_total += *hit;
   }

   return ( N*( N-1) )/2 - found_total;
}


//new
/*
 * Helper for my_compute_upper_bound(): for one row, return the largest
 * value found in whole_data_hrank minus the smallest value found in
 * whole_data_lrank over the `count` attributes listed in `attrs`.
 * Returns 0 when `count` is 0.  Values are converted to size_t.
 */
static size_t my_row_rank_spread(size_t row, size_t *attrs, size_t count)
{
  size_t hi, lo, k;

  if (count == 0)
	return 0;

  hi = whole_data_hrank->data[row][attrs[0]];
  lo = whole_data_lrank->data[row][attrs[0]];

  for (k = 1; k < count; k++)
  {
	if (whole_data_hrank->data[row][attrs[k]] > hi)
		hi = whole_data_hrank->data[row][attrs[k]];

	if (whole_data_lrank->data[row][attrs[k]] < lo)
		lo = whole_data_lrank->data[row][attrs[k]];
  }

  return hi - lo;
}

/*
 * First upper bound on the support of an itemset whose attributes are
 * split into a "positive" and a "negative" group.
 *
 * For every row of the whole dataset the spread (max high value minus
 * min low value) is computed separately for both groups and the larger
 * of the two is summed up.
 *
 * rootAttrPos / sz_pos: attribute numbers of the positive group.
 * rootAttrNeg / sz_neg: attribute numbers of the negative group.
 *
 * Returns (N*(N-1) - sum) / 2, computed in size_t and converted to
 * double.
 */
double my_compute_upper_bound( size_t *rootAttrPos, size_t *rootAttrNeg, size_t sz_pos, size_t sz_neg )
{
  size_t total = 0;
  size_t row;

  for (row = 0; row < whole_data_hrank->n; row++)
  {
	size_t spread_pos = my_row_rank_spread(row, rootAttrPos, sz_pos);
	size_t spread_neg = my_row_rank_spread(row, rootAttrNeg, sz_neg);

	total += (spread_neg > spread_pos) ? spread_neg : spread_pos;
  }

  total = ( N*(N-1) - total ) / 2 ;

  return total;
}


//new
/*
 * Recursive depth-first miner over "signed" items.
 *
 * An item id encodes an attribute and a sign: even ids are treated as
 * positive (printed with '+', bounds[].is_positive = 1), odd ids as
 * negative (printed with '-').  get_num_attr() turns an item id into
 * the attribute (column) number.
 *
 * For the current itemset root[0..lenroot-1] the function:
 *   1. returns early if some (lenroot-1)-subset is found in `freq`
 *      neither as is nor with all its items replaced by their opposites;
 *   2. computes two upper bounds and skips the support computation when
 *      the smaller one is below `minsup` (counted in `pruned`);
 *   3. otherwise counts the support through the range tree, prints the
 *      itemset and stores it in `freq` when it reaches `minsup`;
 *   4. recurses on extensions, trying the positive and (below the first
 *      level) the negative version of every remaining attribute.
 *
 * i        : extensions use j with i < j <= n_items.
 * n_items  : number of numeric attributes.
 * dgreater : dataset whose rows are queried against the range tree.
 * dless    : dataset the range tree is built over (only dless->n is
 *            read here).
 * minsup   : minimum support threshold.
 * root     : buffer holding the current itemset; root[lenroot] is
 *            overwritten while recursing.
 * lenroot  : current itemset length (0 on the initial call).
 * freq     : hashtable of frequent itemsets; key = [len, items...],
 *            value = pointer to the support (both malloc'ed here).
 * rt       : range tree, one dimension is added per level.
 * indices  : per-attribute index arrays handed to the range tree.
 *
 * NOTE(review): the static buffers below hold 5000 entries and nothing
 * in this function checks lenroot against that size.
 * NOTE(review): several printf calls pass size_t values to "%d".
 */
void my_eclat(size_t i, size_t n_items, dataset * dgreater, dataset * dless,
			 size_t minsup, size_t * root, size_t lenroot, htable * freq,
			 range_tree * rt, index_entry ** indices)
{
	
    size_t supp = 0;
    size_t u_bound = 1UL<<30;	/* upper bound on support_tau */
    size_t u_bound2 = 1UL<<30;	/* upper bound on support_tau */
	
    size_t r, s, t; 
	
    dataset * d_gt_new = dgreater; 		/* pruned dataset */
    dataset * d_lt_new = dless; 		/* pruned dataset */
	
    /* n_lt and old_n_lt are declared but never used in this function */
    size_t n_lt, n_gt;
    size_t old_n_lt;
    /* static scratch buffers: shared by all recursion levels, so they are
       only valid between being filled and the next recursive call */
    static size_t prune_buf[5000];
    static size_t prune_buf_compl[5000];
    
    static Bound bounds[5000];
	
    if(i > n_items)
		return;
	
    if(lenroot >= 1)
    {
		/*printf("checking Itemset: [");
		for(s = 0; s < lenroot; s++)
		{		
			printf("%d",root[s]);
			if(s < lenroot - 1)
				printf(",");
			
		}
		printf("]***\n"); */
		
        // pruning (check subsets!!!)
		if(lenroot >= 2)
		{
			/* keys start with their length, here lenroot - 1 */
			prune_buf[0] = lenroot - 1;
			prune_buf_compl[0] = lenroot - 1;

			/* r is the position left out of the subset */
			for(r = 0; r < lenroot; r++)
			{
				t = 1;
				for(s = 0; s < lenroot; s++)
				{
					if(s != r)
					{
						prune_buf[t] = root[s];
						prune_buf_compl[t] = get_opposite_item(prune_buf[t]); 
						
						t++;
					}
				}
				
				/*printf(" Subset Complement: [");
				for(s = 1; s <= prune_buf_compl[0]; s++)
				{		
					printf("%d",prune_buf_compl[s]);
					if(s < prune_buf_compl[0])
						printf(",");
					
				}
				printf("]\n***\n"); */ 
				
				//check if the subset or the complement of the subset is frequent
				/* early return happens before anything is allocated or
				   added to the range tree, so no cleanup is needed */
				if(htable_get(freq, prune_buf) == NULL && htable_get(freq, prune_buf_compl) == NULL)
					return;
			}
		}
		
		d_gt_new = dataset_new();
		d_lt_new = dless;
		
		size_t * rootAttr;
		size_t * rootAttrPos, * rootAttrNeg;

		/* NOTE(review): the three allocations are not checked for NULL */
		rootAttr = (size_t *) malloc (lenroot * sizeof(size_t));
		rootAttrPos = (size_t *) malloc (lenroot * sizeof(size_t));
		rootAttrNeg = (size_t *) malloc (lenroot * sizeof(size_t));
		
		/* split the attribute numbers by sign: r counts the positive
		   (even) items, t the negative (odd) ones */
		r=0; t=0;
		for(s = 0; s < lenroot; s++)
		{
			rootAttr[s] = get_num_attr(root[s]);
			if (root[s]%2==0) //even
				rootAttrPos[r++] = rootAttr[s];
			else //odd
				rootAttrNeg[t++] = rootAttr[s];
		}
		
		/*printf(" Root Attr: [");
		for(s = 0; s < lenroot; s++)
		{		
			printf("%d",rootAttr[s]);
			if(s < lenroot - 1)
				printf(",");
		
		}
		printf("]***\n"); */
		
		//n_gt = dataset_put_nans_at_end(dgreater, rootAttr, lenroot);
		//my_range_tree_add_dim(rt, rootAttr[lenroot - 1], indices[rootAttr[lenroot-1]], dless->n, (root[lenroot-1]%2==0?1:0));

		/* prune using upper bound */
		/* the double results are converted back to size_t on assignment */
  		u_bound  = my_compute_upper_bound( rootAttrPos, rootAttrNeg, r, t);

	        //printf("t: %d\n",t);
		if (t > 0) //if there are negatives...
		{
	  		u_bound2 = my_compute_upper_bound2(root, lenroot, freq );
		}

	        free(rootAttrPos);
                free(rootAttrNeg);

		/* keep the tighter of the two bounds */
		if (u_bound2 < u_bound)
			u_bound = u_bound2;

		if(u_bound < minsup)
		{
			/*printf("**(");
			for(s = 0; s < lenroot; s++)
			{		
				printf("%d",root[s]);
				if(s < lenroot - 1)
					printf(",");
			
			}
			printf(") pruned!!!, ub %d\n", u_bound ); */
			/* supp stays 0, so no extension is explored below */
			pruned++;
		}
		else
		{
			/* presumably moves rows with NaN in the listed attributes to
			   the end and returns the number of remaining rows -- confirm
			   in dataset.h */
			n_gt = dataset_put_nans_at_end(dgreater, rootAttr, lenroot);
			/* add the newest attribute as a range-tree dimension; the last
			   argument is 1 for a positive (even) item, 0 otherwise */
			my_range_tree_add_dim(rt, rootAttr[lenroot - 1], indices[rootAttr[lenroot-1]], dless->n, (root[lenroot-1]%2==0?1:0));

			//printf("u_bound = %d\n",u_bound);
			supp = 0;
			for(r = 0; r < n_gt; r++) //for each row in the (pruned) dataset
			{
				size_t tmp;
				
				/* one bound per item: the row's value in that attribute
				   plus the sign of the item */
				for(s = 0; s < lenroot; s++)
				{
					bounds[s].value = dgreater->data[r][rootAttr[s]];
					bounds[s].is_positive =  (root[s]%2==0?1:0);
				}

				/*printf(" Bounds: [");
				for(s = 0; s < lenroot; s++)
				{
					printf("%f (%d)", bounds[s].value, bounds[s].is_positive);
					if(s < lenroot - 1)
						printf(",");
				}
				printf("]\n"); */


				tmp = my_range_tree_get_count(rt, bounds, lenroot, 1);
				//printf("%d\n", tmp);
				supp += tmp;
				/* rows that contribute nothing are dropped from the dataset
				   handed to the extensions */
				if(tmp > 0)
					dataset_append_record(d_gt_new, dgreater->data[r]);
			}
			//printf("Size of the dataset: %d\n", d_gt_new->n);
			
			//printf("supp = %d\n",supp);
			/* sanity check of the bound; NOTE(review): size_t passed to %d */
			if(supp > u_bound)
				printf("!!!!!!!!!! wrong bound %d, %d!!!!!!!!!\n", supp, u_bound);
			
			if(supp >= minsup)
			{
				//if(lenroot != 1)
				{
					printf("[");
					for(s = 0; s < lenroot; s++)
					{
						//printf("%d", root[s]);
						if (map==1)
						{		
						//	printf("%d",root[s]);
							
							/* with a mapping: attribute name followed by the sign */
							if (root[s]%2==0) //even
								printf("%s+",mapping[get_num_attr(root[s])]);
							else
								printf("%s-",mapping[get_num_attr(root[s])]);
						}	
						else
							printf("%d",root[s]);

						if(s < lenroot - 1)
							printf(", ");
					}
					//printf("] tau= %d ubound= %d F= %lf rho= %lf\n", supp, u_bound, F, rho);
					printf("] %d\n", supp);
					count++;
				}
				// insert new frequent set into a hashtable
				{
					size_t z;
					/* key layout: [lenroot, root[0], ..., root[lenroot-1]];
					   key and value are handed over to the hashtable */
					size_t * key = (size_t *) malloc((lenroot + 1) * sizeof(size_t));
					size_t * value = (size_t *) malloc(sizeof(size_t));
					
					key[0] = lenroot;
					for(z = 0; z < lenroot; z++)
					{
						key[z+1] = root[z];
					}
					*value = supp;
					htable_put(freq, key, value);
				}
			}
		}
		free(rootAttr);
    }
	
    /*if(lenroot < 1 || supp >= minsup) //enumeration of the patterns!!!!
    {
	size_t j;
	for(j = n_items; j > i; j--)
	{
	root[lenroot] = j-1;
	myEclat(j, n_items, d_gt_new, d_lt_new, minsup, root, lenroot + 1, freq, rt, indices);
	}
    } */
	
	/* extend only from the empty set or from a frequent itemset */
	if(lenroot < 1 || supp >= minsup)
	{
		/* NOTE(review): j is an int that is assigned from and compared
		   with size_t values */
		int j;
		for(j = n_items; j > i; j--)
		{
			/* positive item of attribute j-1 */
			root[lenroot] = (j-1)*2;

			//dataset * d_gt_new_prefix = d_gt_new; /* pruned dataset */
		
			my_eclat(j, n_items, d_gt_new, d_lt_new, minsup, root, lenroot + 1, freq, rt, indices);

			/* the negative variant is never tried for the first item */
			if (lenroot > 0)
			{	
				/*printf(" processing negative (current:) [");
				for(s = 0; s < lenroot; s++)
				{
					printf("%d", root[s]);
					if(s < lenroot - 1)
						printf(", ");
				}
				printf("]...\n"); */

				/* even id - 1 = the odd (negative) item tried next */
				root[lenroot] = root[lenroot] -1;

				my_eclat(j, n_items, d_gt_new, d_lt_new, minsup, root, lenroot + 1, freq, rt, indices);
			}
		}
	}
    
    if(lenroot >= 1)
    {
		/* second argument 0: presumably frees only the container, the
		   rows still belong to dgreater -- confirm in dataset.h */
		dataset_free(d_gt_new, 0);
		/* NOTE(review): this also runs on the upper-bound-pruned path,
		   where my_range_tree_add_dim() was not called for this level --
		   confirm that removing a dimension that was never added is safe */
		range_tree_remove_dimension(rt, lenroot + 1);
    } 
}



/*
 * Recursive depth-first miner over plain (unsigned) attributes; items in
 * `root` are attribute numbers.  In the configuration selected at the
 * top of this file mine_freq() calls my_eclat() instead, and the call to
 * this function is commented out.
 *
 * The signature depends on the counting method: with USE_RANGE_TREE the
 * last two parameters are the range tree and its indices, otherwise two
 * marker arrays.
 *
 * For the current itemset root[0..lenroot-1] the function
 *   - returns early unless every (lenroot-1)-subset is present in `freq`;
 *   - (SUPP_TAU) computes an upper bound from the high/low rank datasets
 *     and skips the support computation when it is below `minsup`;
 *   - computes the support with the selected method, prints the itemset
 *     and stores it in `freq` when it reaches `minsup`;
 *   - recurses on extensions with every j, i < j <= n_items.
 *
 * NOTE(review): only the SUPP_TAU + USE_RANGE_TREE combination looks
 * complete.  The SUPP_RHO / SUPP_F code reads `orig_data`, whose
 * declaration is commented out at file scope; the USE_KDTREE code uses
 * `u`, `kdtree_gt` and `kdtree_less`, whose declarations are commented
 * out below; the recursive call always passes `rt` and `indices`.
 * NOTE(review): several printf calls pass size_t values to "%d".
 */
#ifndef USE_RANGE_TREE
void
eclat(size_t i, size_t n_items, dataset * dgreater, dataset * dless,
      size_t minsup, size_t * root, size_t lenroot, htable * freq,
      size_t * mark_gt, size_t * mark_lt)
#endif
#ifdef USE_RANGE_TREE
	  void
	  eclat(size_t i, size_t n_items, dataset * dgreater, dataset * dless,
      size_t minsup, size_t * root, size_t lenroot, htable * freq,
      range_tree * rt, index_entry ** indices)
#endif
{
    size_t supp = 0;
    size_t u_bound = 1UL<<30;	/* upper bound on support_tau */
    size_t l_bound = 0;		/* lower bound on support_tau */
    double F = 0, rho = 0;	/* other definitions of support */
    size_t r, s, t; // u;
    dataset * d_gt_new = dgreater; 		/* pruned dataset */
    dataset * d_lt_new = dless; 		/* pruned dataset */
/*#ifdef USE_KDTREE
    KDtree kdtree_less;
    KDtree kdtree_gt;
#endif */
    size_t n_lt, n_gt;
    size_t old_n_lt;
    /* static scratch buffers shared by all recursion levels */
    static size_t prune_buf[5000];
    static double bounds[5000];
	
    //print 'cand', root, len(Dcur)
    //for(s = 0; s < lenroot; s++)
    //	printf("%d,", root[s]);
	//printf("%d\n", dgreater->n);
    if(i > n_items)
		return;
	
    if(lenroot >= 1)
    {
        // pruning
		if(lenroot >= 2)
		{
			/* keys start with their length, here lenroot - 1 */
			prune_buf[0] = lenroot - 1;
			for(r = 0; r < lenroot; r++)
			{
				t = 1;
				/* the inner loop body is only the first `if`; despite its
				   indentation the htable_get() test below runs once per r,
				   after the subset without position r has been built */
				for(s = 0; s < lenroot; s++)
					if(s != r)
						prune_buf[t++] = root[s];
					if(htable_get(freq, prune_buf) == NULL)
						return;
			}
		}
		
#ifdef SUPP_TAU
		/* create pruned datasets */
/*#ifdef USE_BRUTE_FORCE
		d_gt_new = dataset_new();
		d_lt_new = dless;
#endif*/
#ifdef USE_RANGE_TREE
		d_gt_new = dataset_new();
		d_lt_new = dless;
#endif
/*#ifdef USE_KDTREE
		d_gt_new = dataset_new();
		d_lt_new = dataset_new();
#endif*/
		
		old_n_lt = dless->n;
/*#ifdef USE_BRUTE_FORCE
		n_gt = dataset_put_nans_at_end(dgreater, root, lenroot);
		n_lt = dataset_put_nans_at_end(dless, root, lenroot);
		dless->n = n_lt;
#endif
#ifdef USE_KDTREE
		// initialize the KDtree
		KDtree_init(&kdtree_less, dless, lenroot, root);
		KDtree_init(&kdtree_gt, dgreater, lenroot, root);
#endif /* USE_KDTREE */
		/* everything from the "USE_BRUTE_FORCE" line above down to the
		   "USE_KDTREE" end marker is one block comment, i.e. disabled */
#ifdef USE_RANGE_TREE
		/* presumably moves rows with NaN in the listed attributes to the
		   end and returns the number of remaining rows -- confirm in
		   dataset.h */
		n_gt = dataset_put_nans_at_end(dgreater, root, lenroot);
		range_tree_add_dim(rt, root[lenroot - 1], indices[root[lenroot-1]], dless->n);
#endif
		
		/* prune using upper bound */
		//if(lenroot == 200000)
		{
			/* u_bound = ( sum over rows of (N - max high rank - 1)
			             + sum over rows of (min low rank) ) / 2,
			   where max/min run over the attributes of the itemset */
			u_bound = 0;
			for(r = 0; r < whole_data_hrank->n; r++)
			{
				double max = whole_data_hrank->data[r][root[0]];
				
				for(t = 1; t < lenroot; t++)
				{
					if(whole_data_hrank->data[r][root[t]] > max)
					{
						max = whole_data_hrank->data[r][root[t]];
					}
				}
				u_bound += N - max - 1;
			}
			for(r = 0; r < whole_data_lrank->n; r++)
			{
				double min = whole_data_lrank->data[r][root[0]];
				
				for(t = 1; t < lenroot; t++)
				{
					if(whole_data_lrank->data[r][root[t]] < min)
					{
						min = whole_data_lrank->data[r][root[t]];
					}
				}
				u_bound += min;
			}
			u_bound /= 2;
		}
		/* compute lower bound */
		//l_bound = orig_data->n * (orig_data->n - 1) / 2;
		//for(r = 0; r < orig_data->n; r++)
		//{
		//    double max = orig_data->data[r][root[0]];
		//    double min = orig_data->data[r][root[0]];
		//
		//    for(t = 1; t < lenroot; t++)
		//    {
		//	if(orig_data->data[r][root[t]] > max)
		//	{
		//	    max = orig_data->data[r][root[t]];
		//	}
		//	if(orig_data->data[r][root[t]] < min)
		//	{
		//	    min = orig_data->data[r][root[t]];
		//	}
		//    }
		//    l_bound -= (max - min);
		//}
#endif /* SUPP_TAU */
		
#if defined (SUPP_RHO) || defined (SUPP_F)
		/* rho and F */
		/* F sums (max - min) per row, rho sums its square */
		F = 0;
		rho = 0;
		for(r = 0; r < orig_data->n; r++)
		{
			double max = orig_data->data[r][root[0]];
			double min = orig_data->data[r][root[0]];
			
			for(t = 1; t < lenroot; t++)
			{
				if(orig_data->data[r][root[t]] > max)
				{
					max = orig_data->data[r][root[t]];
				}
				if(orig_data->data[r][root[t]] < min)
				{
					min = orig_data->data[r][root[t]];
				}
			}
			F += (max - min);
			rho += (max - min) * (max - min);
		}
#endif
		
#ifdef SUPP_TAU
		if(u_bound < minsup)
		{
			/* supp stays 0, so no extension is explored below */
			//if(lenroot > 2)
				printf("pruned!!!, length %d\n", lenroot);
		}
		else
#endif /* SUPP_TAU */
		{
#ifdef SUPP_TAU
			/* compute support */
#ifdef USE_BRUTE_FORCE
			/* count, for every row of dgreater, the rows of dless that are
			   strictly smaller in all attributes of the itemset */
			supp = 0;
			for(r = 0; r < n_gt; r++)
			{
				size_t tmp;
				size_t tmp2;
				
				for(s = 0; s < lenroot; s++)
					bounds[s] = dgreater->data[r][root[s]];
				tmp2 = 0;
				for(s = 0; s < n_lt; s++)
				{
					size_t ok = 1;
					/* the loop body is only the `if`; `tmp2 += ok` runs once
					   per s despite its indentation */
					for(t = 0; t < lenroot; t++)
						if(isnan(bounds[t]) || isnan(dless->data[s][root[t]]) || bounds[t] <= dless->data[s][root[t]])
							ok = 0;
						tmp2 += ok;
				}
				supp += tmp2;
				if(tmp2 > 0)
					dataset_append_record(d_gt_new, dgreater->data[r]);
			}
#endif /* USE_BRUTE_FORCE */
			
#ifdef USE_RANGE_TREE
			supp = 0;
			for(r = 0; r < n_gt; r++) //for each transaction
			{
				size_t tmp;
				
				/* query the tree with the row's values in the itemset's
				   attributes */
				for(s = 0; s < lenroot; s++)
					bounds[s] = dgreater->data[r][root[s]];
				tmp = range_tree_get_count(rt, bounds, lenroot);
				//printf("%d\n", tmp);
				supp += tmp;
				/* rows that contribute nothing are dropped from the dataset
				   handed to the extensions */
				if(tmp > 0)
					dataset_append_record(d_gt_new, dgreater->data[r]);
			}
#endif /* USE_RANGE_TREE */
			
#ifdef USE_KDTREE
			supp = KDtree_join(&kdtree_gt, &kdtree_less, mark_gt, mark_lt);
			// remove unused records
			for(u = 0; u < kdtree_gt.n; u++)
			{
				if(mark_gt[u])
					dataset_append_record(d_gt_new, kdtree_gt.data[u]);
			}
			for(u = 0; u < kdtree_less.n; u++)
			{
				if(mark_lt[u])
					dataset_append_record(d_lt_new, kdtree_less.data[u]);
			}
			//printf("%d,%d\n", d_lt_new->n, d_gt_new->n);
#endif /* USE_KDTREE */
			
#endif /* SUPP_TAU */
#ifdef SUPP_F
			F = (double)N*(N-1) - F;
			supp = F;
#endif
#ifdef SUPP_RHO
			rho = (double)N*(N-1)*(N-1) - rho;
			supp = rho / 1000000;
#endif
			
			
			//printf("lenroot %d, supp=%d, u_bound=%d\n", lenroot,supp, u_bound);
#ifdef SUPP_TAU
			/* sanity check of the bound ("worng" is part of the emitted text) */
			if(supp > u_bound)
				printf("!!!!!!!!!! worng bound %d, %d!!!!!!!!!\n", supp, u_bound);
			//if(supp < minsup && u_bound>= minsup)
			//	printf("!!!!!!!!!! pruned by support %d, %d !!!!!!!!!\n", supp, u_bound);
#endif	     
			//printf("===> %d, [%d, %d] %f\n", supp, l_bound, u_bound, (double)(u_bound - l_bound) / l_bound);
			//supp = u_bound;
			if(supp >= minsup)
			{
				//if(lenroot != 1)
				{
					printf("[");
					for(s = 0; s < lenroot; s++)
					{
						printf("%d", root[s]);
						if(s < lenroot - 1)
							printf(", ");
					}
					//printf("] tau= %d ubound= %d F= %lf rho= %lf\n", supp, u_bound, F, rho);
					printf("] %d\n", supp);
					count++;
				}
				// insert new frequent set into a hashtable
				{
					size_t z;
					/* key layout: [lenroot, root[0], ..., root[lenroot-1]];
					   key and value are handed over to the hashtable */
					size_t * key = (size_t *) malloc((lenroot + 1) * sizeof(size_t));
					size_t * value = (size_t *) malloc(sizeof(size_t));
					
					key[0] = lenroot;
					for(z = 0; z < lenroot; z++)
					{
						key[z+1] = root[z];
					}
					*value = supp;
					htable_put(freq, key, value);
				}
			}
	}
    }
    if(lenroot < 1 || supp >= minsup) //enumeration of the patterns!!!!
    {
		size_t j;
		/* extend with attribute j-1 for every j in (i, n_items] */
		for(j = n_items; j > i; j--)
		{
			root[lenroot] = j-1;
			eclat(j, n_items, d_gt_new, d_lt_new, minsup, root, lenroot + 1, freq, rt, indices);
		}


//#ifdef USE_RANGE_TREE
//			eclat(j, n_items, d_gt_new, d_lt_new, minsup, root, lenroot + 1, freq, rt, indices);
//#else
//			eclat(j, n_items, d_gt_new, d_lt_new, minsup, root, lenroot + 1, freq, mark_gt, mark_lt);
//#endif
//		}
    }
#ifdef SUPP_TAU
    /* release the per-level pruned datasets (second argument 0:
       presumably the rows themselves are not freed -- confirm in
       dataset.h) */
    if(lenroot >= 1)
    {
#ifdef USE_KDTREE
		dataset_free(d_gt_new, 0);
		dataset_free(d_lt_new, 0);
#endif
#ifdef USE_BRUTE_FORCE
		dless->n = old_n_lt;
		dataset_free(d_gt_new, 0);
#endif
#ifdef USE_RANGE_TREE
		dataset_free(d_gt_new, 0);
#endif
    }
#ifdef USE_RANGE_TREE
    /* drop the range-tree dimension added for this level */
    if(lenroot >= 1)
    {
		range_tree_remove_dimension(rt, lenroot+1);
    }
#endif
#endif /* SUPP_TAU */
}


/*
 * Hash function for itemset keys.
 *
 * v points to a size_t array laid out as [len, e1, ..., e_len].  The
 * hash starts at 0 and, for every element in order, is shifted left by
 * two bits and increased by the element.
 */
static
size_t hashvect(void * v)
{
    const size_t * key = (const size_t *)v;
    const size_t * cur = key + 1;
    const size_t * end = cur + key[0];
    size_t acc = 0;

    while(cur != end)
    {
		acc = (acc << 2) + *cur;
		++cur;
    }
    return acc;
}

/*
 * Equality test for itemset keys.
 *
 * a and b point to size_t arrays laid out as [len, e1, ..., e_len].
 * Returns 1 when both have the same length and the same elements in the
 * same order, 0 otherwise.
 */
static
int cmpvect(void * a, void * b)
{
    const size_t * lhs = (const size_t *)a;
    const size_t * rhs = (const size_t *)b;
    size_t remaining = lhs[0];

    if(remaining != rhs[0])
		return 0;

    /* both pointers sit on the length slot; pre-increment moves them to
       the next element before it is compared */
    while(remaining-- > 0)
    {
		if(*++lhs != *++rhs)
			return 0;
    }
    return 1;
}

/*
 * Hash function for NUL-terminated strings: starting from 0, the hash is
 * shifted left by two bits and increased by each character in turn.
 */
static
size_t hasstring(void * v)
{
    const char * cur;
    size_t acc = 0;

    for(cur = (const char *)v; *cur != '\0'; ++cur)
		acc = (acc << 2) + *cur;
    return acc;
}

/*
 * Equality test for NUL-terminated strings: returns 1 when a and b hold
 * the same characters, 0 otherwise.
 */
static
int cmpstring(void * a, void * b)
{
    const char * lhs = (const char *)a;
    const char * rhs = (const char *)b;

    return strcmp(lhs, rhs) == 0;
}

/*
 * Mine all frequent itemsets of one pair of datasets.
 *
 * Creates the frequent-itemset hashtable (and, with USE_RANGE_TREE, the
 * range tree and its per-attribute indices), runs the recursive miner
 * from the empty itemset and releases everything again.  Results are
 * printed by the miner itself.
 *
 * n_items  : number of numeric attributes.
 * dless    : dataset the range-tree indices are built from.
 * dgreater : dataset whose rows are queried against it.
 * minsup   : minimum support threshold.
 *
 * The miner works with fixed buffers of 5000 entries (`root` below and
 * the static scratch buffers of the recursive functions, one of which
 * needs n_items + 1 slots), so the process exits with status 1 when
 * n_items does not fit instead of overrunning them.
 */
void
mine_freq(size_t n_items, dataset * dless, dataset * dgreater, size_t minsup)
{
    size_t root[5000];
    htable * freq;
#ifdef USE_RANGE_TREE
    index_entry ** indices;
    range_tree * rt;
#endif

    /* an itemset key is [len, items...], i.e. n_items + 1 entries */
    if(n_items >= sizeof(root) / sizeof(root[0]))
    {
		fprintf(stderr, "too many items (%zu), at most %zu are supported\n",
				n_items, sizeof(root) / sizeof(root[0]) - 1);
		exit(1);
    }

#ifdef USE_RANGE_TREE
    indices = range_tree_create_indices(dless);
    rt = range_tree_new(0.0);
#endif
	
    freq = htable_new(hashvect, cmpvect);
#ifdef USE_RANGE_TREE
    //eclat(0, n_items, dgreater, dless, minsup, root, 0, freq, rt, indices);
	my_eclat(0, n_items, dgreater, dless, minsup, root, 0, freq, rt, indices);

#elif defined USE_KDTREE || defined USE_BRUTE_FORCE
    size_t mark_gt[dgreater->n];
    size_t mark_lt[dless->n];
    eclat(0, n_items, dgreater, dless, minsup, root, 0, freq, mark_gt, mark_lt);
#endif
/* disabled variant; it needs a local `size_t i;` when re-enabled
#ifdef USE_EXPLICIT_PAIRS
    tidlist ** itdl = make_item_tidlists(dgreater, dless);
    eclat_items(0, NULL, n_items, itdl, minsup, root, 0, freq);
    for(i = 0; i < n_items; i++)
    {
		tidlist_free(itdl[i]);
    }
    free(itdl);
#endif */
    /* keys and values were malloc'ed by the miner; the two 1 flags
       presumably ask the table to free both -- confirm in htable.h */
    htable_free(freq, 1, 1);
	
	
#ifdef USE_RANGE_TREE
    range_tree_free(rt);
    range_tree_free_indices(indices, n_items);
#endif
}

/*
 * Mine frequent itemsets separately for every combination of classes of
 * the categorical attributes.
 *
 * At depth > 0 the numeric datasets passed in are mined first.  Then,
 * for every categorical attribute after the last one already fixed
 * (iset[depth-1]) and every pair of its classes (c1, c2), the rows of
 * the "less" datasets with class c1 and the rows of the "greater"
 * datasets with class c2 are collected, and the function recurses on
 * them when their sizes allow at least `minsup` pairs.
 *
 * iset            : categorical attributes fixed so far; iset[depth] is
 *                   written here.
 * depth           : number of categorical attributes already fixed.
 * categ_values    : per categorical attribute, the table of its classes
 *                   (only its size `n` is read).
 * categ_d_greater : categorical columns of the "greater" rows.
 * d_greater       : numeric columns of the "greater" rows.
 * categ_d_less    : categorical columns of the "less" rows.
 * d_less          : numeric columns of the "less" rows.
 * minsup          : minimum support threshold.
 *
 * Rows whose class value is NaN (missing) never match any class.
 */
void
mine_freq_for_categ_attr(size_t * iset, int depth, 
						 htable ** categ_values,
						 dataset * categ_d_greater, dataset * d_greater,
						 dataset * categ_d_less, dataset * d_less,
						 size_t minsup)
{
    size_t c1, c2;
    size_t r;
    size_t max_supp;
    int k;
    int last_attr;
    int attr_no;
	
	
    /* find all frequent in this class */
    if(depth > 0)
		mine_freq(d_less->n_items, d_less, d_greater, minsup);
	
	
    /* recursive call: only attributes after the last fixed one */
    if(depth == 0)
		last_attr = 0;
    else
		last_attr = (int)iset[depth-1] + 1;
	
    for(attr_no = (int)categ_d_less->n_items - 1; attr_no >= last_attr; attr_no--)
    {
		size_t n = categ_values[attr_no]->n;
		iset[depth] = attr_no;
		for(c1 = 0; c1 < n; c1++)
		{
			for(c2 = 0; c2 < n; c2++)
			{
				/* create databases for each pair attribute classes */
				dataset * dless_new = dataset_new();
				dataset * dgreater_new = dataset_new();
				dataset * categ_dless_new = dataset_new();
				dataset * categ_dgreater_new = dataset_new();
				dless_new->n_items = d_less->n_items;
				dgreater_new->n_items = d_greater->n_items;
				categ_dless_new->n_items = categ_d_less->n_items;
				categ_dgreater_new->n_items = categ_d_greater->n_items;
				for(r = 0; r < d_less->n; r++)
				{
					double cls = categ_d_less->data[r][attr_no];

					/* converting NaN to an integer is undefined, so missing
					   values are excluded explicitly */
					if(!isnan(cls) && (size_t)(int)cls == c1)
					{
						dataset_append_record(dless_new, d_less->data[r]);
						dataset_append_record(categ_dless_new, categ_d_less->data[r]);
					}
				}
				for(r = 0; r < d_greater->n; r++)
				{
					double cls = categ_d_greater->data[r][attr_no];

					if(!isnan(cls) && (size_t)(int)cls == c2)
					{
						dataset_append_record(dgreater_new, d_greater->data[r]);
						dataset_append_record(categ_dgreater_new, categ_d_greater->data[r]);
					}
				}
				/* %zu: iset[], c1 and c2 are size_t */
				printf("----------------- ");
				for(k = 0; k < depth+1; k++) printf("%zu ", iset[k]);
				printf(", classes %zu/%zu -------------------\n", c1, c2);
				/* no more pairs than |less| * |greater| can be supported */
				max_supp = dless_new->n * dgreater_new->n;
				if(max_supp >= minsup)
				{
					mine_freq_for_categ_attr(iset, depth + 1, categ_values,
						categ_dgreater_new, dgreater_new,
						categ_dless_new, dless_new,
						minsup);
				}
				else
				{
					printf("datasets too small\n");
				}
				/* second argument 0: the rows belong to the parent datasets */
				dataset_free(dless_new, 0);
				dataset_free(dgreater_new, 0);
				dataset_free(categ_dless_new, 0);
				dataset_free(categ_dgreater_new, 0);
			}
		}
    }
}

/*
 * Program entry point.
 *
 * usage: pairmining file minsupport [mapping]
 *
 *   file       : text file with one record per line, fields separated by
 *                ',' or ';'.  Categorical fields (those starting with a
 *                letter in the first line) must come first.  A field "?"
 *                is a missing value.
 *   minsupport : minimum support threshold.
 *   mapping    : optional file with "<index> <name>" lines used to print
 *                attribute names instead of item numbers.
 *
 * The first line of the file fixes the number of categorical and numeric
 * columns.  Later lines with fewer fields get NaN in the missing trailing
 * columns, surplus fields are ignored, and lines without any field are
 * skipped.
 *
 * NOTE: strtok() merges consecutive delimiters, so an empty field such as
 * "1,,3" is not seen as a missing value; the following fields shift left.
 *
 * Returns 0 on success and 10 on usage, I/O or allocation errors.
 */
int
main(int argc, char ** argv)
{
    FILE * f;
    char * line;
    dataset * d = dataset_new();
    dataset * d_lrank;
    dataset * d_hrank;
    dataset * categ_d = dataset_new(); /* categorical columns of the dataset */
	
    size_t n_items;
    size_t n_categ_items;	/* assume categorical items come FIRST */
    size_t minsup;
	
    size_t i, j;
	
    if(argc < 3)
    {
		fprintf(stderr, "usage: pairmining file minsupport [mapping]\n");
		return 10;
    }
    minsup = atol(argv[2]); 
    f = fopen(argv[1], "r");
    if(f == NULL)
    {
		fprintf(stderr, "error opening file\n");
		return 10;
    }

	
    /* read data characteristics */
    line = fgets(linebuf, MAXLINE, f);

    n_items = count_items(linebuf, &n_categ_items);
    
    n_items -= n_categ_items;
    d->n_items = n_items;
    categ_d->n_items = n_categ_items;

    //new  
    map = 0;	
    if (argc == 4)
    {
	map = 1;
	my_create_mapping(argv[3],d->n_items);
    }	
	
    /* create lists of values for categorical attributes */
    htable ** categ_values = (htable**) malloc(n_categ_items * sizeof(htable *));
    if(categ_values == NULL && n_categ_items > 0)
    {
		fprintf(stderr, "out of memory\n");
		fclose(f);
		return 10;
    }
    for(i = 0; i < n_categ_items; i++)
    {
		categ_values[i] = htable_new(hasstring, cmpstring);
    }
	
    /* read data into memory */
    while(line != NULL && strlen(line) > 0)
    {
		double * record;
		double * categ_record;
		char * str;
		size_t len = strlen(line);
		
		/* strip the newline only when there is one: the last line of a
		   file may end without '\n' and must keep its final character */
		if(line[len - 1] == '\n')
		{
			line[len - 1] = '\0';
		}

		str = strtok(line, ",;");
		if(str == NULL)
		{
			/* line without any field: not a record */
			line = fgets(linebuf, MAXLINE, f);
			continue;
		}

		categ_record = (double *) malloc(n_categ_items * sizeof(double));
		record = (double *) malloc(n_items * sizeof(double));
		if((categ_record == NULL && n_categ_items > 0) ||
		   (record == NULL && n_items > 0))
		{
			fprintf(stderr, "out of memory\n");
			free(categ_record);
			free(record);
			fclose(f);
			return 10;
		}
		/* columns a short line does not provide stay "missing" */
		for(i = 0; i < n_categ_items; i++)
			categ_record[i] = NAN;
		for(i = 0; i < n_items; i++)
			record[i] = NAN;

		j = 0;
		while(str != NULL)
		{
			if(j < n_categ_items)
			{
				if(str[0] == '\0' || !strcmp(str, "?"))
				{
					categ_record[j] = NAN;
				}
				else
				{
					void * cat_no = htable_get(categ_values[j], str);
					if(cat_no == NULL)//add value to list
					{
						/* classes are numbered in order of first appearance */
						cat_no = (size_t *) malloc(sizeof(size_t));
						if(cat_no == NULL)
						{
							fprintf(stderr, "out of memory\n");
							fclose(f);
							return 10;
						}
						*((size_t *)cat_no) = categ_values[j]->n;
						htable_put(categ_values[j], strdup(str), cat_no);
					}
					categ_record[j] = (float)(*(size_t *)cat_no);
				}
			}
			else if(j - n_categ_items < n_items)
			{
				if(str[0] == '\0' || !strcmp(str, "?"))
				{
					record[j - n_categ_items] = NAN;
				}
				else
				{
					record[j - n_categ_items] = atof(str);
				}
			}
			/* else: more fields than the first line announced; writing
			   them would overrun `record`, so they are ignored */

			str = strtok(NULL, ",;");
			j++;
		}
		/* append record to data */
		dataset_append_record(d, record);
		dataset_append_record(categ_d, categ_record);
		line = fgets(linebuf, MAXLINE, f);
    }
    fclose(f);
    N = d->n;			/* store initial database size (number of rows)*/

	
    /* compute ranks */
    d_hrank = dataset_clone(d, 1);
    d_lrank = dataset_clone(d, 1);
    convert_to_ranks(d, TIES_AVG);
    convert_to_ranks(d_hrank, TIES_MAX);
    convert_to_ranks(d_lrank, TIES_MIN);
    whole_data_hrank = d_hrank; //ties max
    whole_data_lrank = d_lrank; //ties min

    count=0; pruned=0;
   
    if(n_categ_items == 0)
    {
		/* only numerical attributes: the dataset is paired with itself */
		mine_freq(d->n_items, d, d, minsup);
    }
    else
    {
		size_t *iset;
		iset =	(size_t *) malloc(n_categ_items * sizeof(size_t));
		if(iset == NULL)
		{
			fprintf(stderr, "out of memory\n");
			return 10;
		}
		mine_freq_for_categ_attr(iset, 0, categ_values, categ_d, d, categ_d, d, minsup);
		free(iset);
    }
	
    /* free data */
    for(i = 0; i < n_categ_items; i++)
    {
		htable_free(categ_values[i], 1, 1);
    }

    free(categ_values);
    dataset_free(categ_d, 1);
    dataset_free(d_lrank, 1);
    dataset_free(d_hrank, 1);
    dataset_free(d, 1);

    //new	
    if (map==1)
	    free(mapping); 
    /* %zu: both counters are size_t */
    printf("number of patterns: %zu\n", count);    
    printf("***pruned: %zu\n", pruned);
	
    return 0;
}
