/* dataset.c
 * 
 * Copyright (C) 2005 2006 Toon Calders, Bart Goethals, Szymon Jaroszewicz
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or (at
 * your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <stdlib.h>

#include "dataset.h"


dataset *
dataset_new(void)
{
    dataset * d = malloc(sizeof(dataset));

    if(d != NULL)
    {
	d->data = NULL;
	d->n = 0;
	d->alloc_size = 0;
    }
    return d;
}

/* Release a dataset.
 *
 * d            - dataset to destroy; NULL is accepted and ignored.
 * free_records - when non-zero, every record pointer stored in the dataset
 *                is passed to free() as well; when zero only the pointer
 *                array and the dataset object itself are released.
 */
void
dataset_free(dataset * d, int free_records)
{
    size_t i;

    if(d == NULL)
        return;

    /* Only walk the records when there is an array to walk: d->data may be
       NULL while d->n is still non-zero if an earlier growth attempt failed. */
    if(free_records && d->data != NULL)
    {
        for(i = 0; i < d->n; i++)
        {
            free(d->data[i]);
        }
    }
    /* The pointer array is released only when this dataset recorded a
       non-zero capacity for it. */
    if(d->alloc_size > 0)
    {
        free(d->data);
    }
    free(d);
}



/* Append a record pointer to the dataset.
 *
 * The pointer itself is stored (the record is not copied).  The backing
 * array starts at 1024 slots and doubles whenever it is full.
 *
 * Returns 1 on success and 0 if the backing array could not be allocated
 * or grown.  On failure the dataset is left exactly as it was: existing
 * records, count and capacity are untouched.
 */
int
dataset_append_record(dataset * d, double * record)
{
    if(d->n >= d->alloc_size)
    {
        size_t new_size;
        double ** grown;

        if(d->alloc_size == 0)
        {
            new_size = 1024;
        }
        else
        {
            /* refuse to double if the element count would wrap */
            if(d->alloc_size > (size_t)-1 / 2)
                return 0;
            new_size = d->alloc_size * 2;
        }
        /* refuse if the byte count would wrap */
        if(new_size > (size_t)-1 / sizeof(double *))
            return 0;

        if(d->alloc_size == 0)
            grown = malloc(new_size * sizeof(double *));
        else
            grown = realloc(d->data, new_size * sizeof(double *));

        /* Commit the new array and capacity only after the allocation
           succeeded; assigning realloc()'s result straight to d->data
           would leak the old array and lose every stored record. */
        if(grown == NULL)
            return 0;
        d->data = grown;
        d->alloc_size = new_size;
    }
    if(d->data == NULL)
        return 0;
    d->data[d->n++] = record;
    return 1;
}

/* Return 1 if any of the nattrs columns listed in attrs is NaN in row,
   0 otherwise. */
static int
row_has_nan_in_attrs(const double * row, const size_t * attrs, size_t nattrs)
{
    size_t k;

    for(k = 0; k < nattrs; k++)
    {
        if(isnan(row[attrs[k]]))
            return 1;
    }
    return 0;
}

/* Move records with NaNs in the specified attrs to the end.
 *
 * d      - dataset whose record pointers are reordered in place
 * attrs  - column indices to inspect in each record
 * nattrs - number of entries in attrs
 *
 * Returns the number of records without NaNs; those records occupy
 * positions [0, result) afterwards.  The relative order of records is not
 * preserved.  An empty (or NULL) dataset yields 0.
 */
size_t
dataset_put_nans_at_end(dataset * d, size_t * attrs, size_t nattrs)
{
    size_t i, j;
    double * tmp;

    /* With no records, d->n - 1 would wrap around and the scan below would
       read far outside the array. */
    if(d == NULL || d->n == 0)
        return 0;

    i = 0;
    j = d->n - 1;

    while(i < j)
    {
        if(row_has_nan_in_attrs(d->data[i], attrs, nattrs))
        {
            /* Swap the NaN record to the back; the record brought to
               position i has not been examined yet, so i stays put. */
            tmp = d->data[i];
            d->data[i] = d->data[j];
            d->data[j] = tmp;
            j--;
        }
        else
        {
            i++;
        }
    }
    /* i == j here: exactly one record is still unclassified. */
    if(!row_has_nan_in_attrs(d->data[i], attrs, nattrs))
        i++;
    return i;
}

/* Create a copy of a dataset.
 *
 * d             - dataset to copy
 * clone_records - when zero, the new dataset stores the same record
 *                 pointers as d (records are shared); when non-zero, each
 *                 record is duplicated into a freshly allocated array of
 *                 n_items doubles.
 *
 * Returns the new dataset, or NULL if any allocation fails.  On failure
 * everything allocated for the copy is released; d is never modified.
 */
dataset *
dataset_clone(dataset * d, int clone_records)
{
    dataset * new_d = dataset_new();
    size_t i, j;
    double * new_r;

    if(new_d == NULL)
        return NULL;

    new_d->n_items = d->n_items;
    for(i = 0; i < d->n; i++)
    {
        if(!clone_records)
        {
            new_r = d->data[i];
        }
        else
        {
            new_r = malloc(new_d->n_items * sizeof(double));
            /* malloc(0) may legitimately return NULL, so a NULL result only
               counts as a failure when there is something to copy. */
            if(new_r == NULL && new_d->n_items > 0)
                goto fail;
            for(j = 0; j < new_d->n_items; j++)
            {
                new_r[j] = d->data[i][j];
            }
        }
        if(!dataset_append_record(new_d, new_r))
        {
            /* the record never made it into new_d, so free it here */
            if(clone_records)
                free(new_r);
            goto fail;
        }
    }

    return new_d;

fail:
    /* If the append left the copy without a pointer array there are no
       records to walk during cleanup. */
    if(new_d->data == NULL)
        new_d->n = 0;
    /* Free the duplicated records only when they belong to the copy. */
    dataset_free(new_d, clone_records);
    return NULL;
}


/* Structure used for index sort: one entry per record of the column being
   ranked. */
struct val_index
{
    size_t index;   /* position of the record in the dataset before sorting;
                       size_t so every record position is representable */
    double val;     /* value of the current attribute in that record */
    double rank;    /* rank assigned to val after sorting */
};

/* qsort() comparator for struct val_index entries.
 *
 * Orders entries by ascending val.  NaN values compare equal to each other
 * and greater than every non-NaN value, so they end up at the back of the
 * sorted array.
 */
static int
comp_on_val(const void * a_p, const void * b_p)
{
    const struct val_index * lhs = a_p;
    const struct val_index * rhs = b_p;
    const int lhs_nan = isnan(lhs->val) != 0;
    const int rhs_nan = isnan(rhs->val) != 0;

    /* At least one NaN: 0 if both, +1 if only lhs, -1 if only rhs. */
    if(lhs_nan || rhs_nan)
        return lhs_nan - rhs_nan;

    if(lhs->val < rhs->val)
        return -1;
    if(lhs->val > rhs->val)
        return 1;
    return 0;
}

/* Compute the (0-based) rank shared by a run of tied values.
 *
 * seqstart - sorted position of the first element of the run
 * i        - sorted position one past the last element of the run
 * mode     - tie-breaking rule:
 *              TIES_MIN  lowest position in the run (seqstart)
 *              TIES_MAX  highest position in the run (i - 1)
 *              TIES_AVG  midpoint of the two
 *            Any other value is treated as TIES_AVG, so the result is
 *            always defined.
 */
static double compute_rank(size_t seqstart, size_t i, int mode)
{
    double rank;

    switch(mode)
    {
    case TIES_MIN:
        rank = seqstart;
        break;
    case TIES_MAX:
        rank = i - 1;
        break;
    case TIES_AVG:
    default:
        /* without a default an unknown mode returned an uninitialized value */
        rank = seqstart + (double)(i - 1 - seqstart) / 2;
        break;
    }
    return rank;
}

/* Replace every value in the dataset by its rank within its column.
 *
 * For each of the n_items attributes the column is sorted, runs of equal
 * values receive a common rank chosen by compute_rank() according to mode
 * (TIES_AVG, TIES_MIN or TIES_MAX), and the ranks are written back into
 * the records in place.  NaN entries stay NaN.
 *
 * An empty dataset is left alone.  If the scratch buffer cannot be
 * allocated the function returns without modifying the dataset (there is
 * no return value through which to report the failure).
 */
void
convert_to_ranks(dataset * d, int mode)
{
    struct val_index * column;
    int attr;
    size_t i;
    size_t seqstart;
    double old_x;
    double rank;

    /* Nothing to rank; this also avoids reading column[0] of an empty
       buffer below. */
    if(d->n == 0)
        return;
    if(d->n > (size_t)-1 / sizeof *column)
        return;

    column = malloc(d->n * sizeof *column);
    if(column == NULL)
        return;

    for(attr = 0; attr < d->n_items; attr++)
    {
        /* Pair each value with its record position so the ranks can be
           written back after sorting. */
        for(i = 0; i < d->n; i++)
        {
            column[i].index = i;
            column[i].val = d->data[i][attr];
        }
        qsort(column, d->n, sizeof *column, comp_on_val);

        /* Walk the sorted column; each time the value changes, close the
           run [seqstart, i) and give all its members the same rank. */
        old_x = column[0].val;
        seqstart = 0;
        for(i = 1; i < d->n; i++)
        {
            double x = column[i].val;
            if(x != old_x)
            {
                rank = compute_rank(seqstart, i, mode);
                for(; seqstart < i; seqstart++)
                {
                    column[seqstart].rank = rank;
                }
                old_x = x;
            }
        }
        /* close the final run (i == d->n here) */
        rank = compute_rank(seqstart, i, mode);
        for(; seqstart < i; seqstart++)
        {
            column[seqstart].rank = rank;
        }

        /* Write the ranks back to the original records; a NaN value keeps
           its NaN instead of receiving a rank. */
        for(i = 0; i < d->n; i++)
        {
            d->data[column[i].index][attr] =
                isnan(column[i].val) ? column[i].val : column[i].rank;
        }
    }
    free(column);
}
