#!/usr/bin/gawk -f

BEGIN {
	FS = OFS = ",";
	NotKnown = "?";
	NormalK = 10;
	ChunkK = 4;
	TooMuch = 2;
	TopBins = 2;
	Seed = 0;
	TotalFields = 6;

	OFMT="%10.10g"
}

END {
	init();
	for (era = 0; era <= 10; era++)
	{
		print "this is era #"era;
		findMinMax();
		initBins();
		fillBins();
		chunkBins();
		findBestBins();
		# clean the first (normal) layer of data which is only the Data array and get ready to put the new instances in
		delete Data;

		for (run = 1; run <= 1000; run++)
		{
			curveBestBins();


			a = selectedValue[1];
			b = selectedValue[2];
			c = selectedValue[3];
			d = selectedValue[4];
			e = selectedValue[5];
		
			srand(++Seed);

			#if these values are equal to "?" they were not determined from the curve. otherwise, they are cached and can be used	
			if (a == "?")
				a = x_a();
			if (b == "?")
				b = x_b();
			if (c == "?")
				c = x_c();
			if (d == "?")
				d = x_d();
			if (e == "?")
				e = x_e();

			score = (a^2/sin(sqrt(b)))*c*sin(d)*e;
			InputArray[0] = 6;
			InputArray[1] = a;
			InputArray[2] = b;
			InputArray[3] = c;
			InputArray[4] = d;
			InputArray[5] = e;
			InputArray[6] = score;
			addInstance(InputArray);

			print a,b,c,d,e,score;

		}
		
		# clean the second (chunk) layer of data which is all other arrays but the Data array and get ready for next chunking
		delete Max;
		delete Min;
		delete TotalBins;
		delete Breaks;
		delete Counts;
		delete newTotalBins;
		delete newBreaks;
		delete newCounts;
		delete bestBins;
		delete tempArray;
		delete sumArray;
		delete seenField;
		delete selectedValue;
	}
	print "best:"a,b,c,d,e,score;
}

function x_a() {return real(1,100)}
function x_b() {return radian() }
function x_c() {return real(1,100)}
function x_d() {return radian() }
function x_e() {return real(1,100)}

function radian()
{
	return rand()*2*1068966896/340262731;
}

function between(min,max)
{
	if (max == min) 
		return min;
	else
		return min+(max-min)*rand();
}

function real(min,max)
{
	return between(min,max);
}

function round(x)
{
	return int(x+0.5);
}

function init()
{
	for (counter = 1; counter <= 1000; counter++)
	{
		srand(++Seed);
		a = x_a();
		b = x_b();
		c = x_c();
		d = x_d();
		e = x_e();
		score = (a^2/sin(sqrt(b)))*c*sin(d)*e;
		InputArray[1] = a;
		InputArray[2] = b;
		InputArray[3] = c;
		InputArray[4] = d;
		InputArray[5] = e;
		InputArray[6] = score;
		addInstance(InputArray);
	}	
}

function addInstance(InputArray)
{
	totalInstances++;
	for (field = 1; field <= TotalFields; field++)
	{
		Data[totalInstances,field] = InputArray[field];

		#NOTE: this needs more attention
		numericFields[field] = 1;
	}
}

function findMinMax()
{
	for (field = 1; field <= TotalFields; field++)
	{
		if (numericFields[field] == 1)
		{
			Min[field] = Data[1,field];
			Max[field] = Data[1,field];
		}
	}
	for (instance = 1; instance <= totalInstances; instance++)
	{
		for (field = 1; field <= TotalFields; field++)
		{
			if (numericFields[field] == 1 && Data[instance,field] != NotKnown)
			{
				if (Data[instance,field] < Min[field])
					Min[field] = Data[instance,field];
				if (Data[instance,field] > Max[field])
					Max[field] = Data[instance,field];
			}
		}
	}
}

function initBins()
{
	for (field = 1; field <= TotalFields; field++)
	{
		if (numericFields[field] == 1)
		{
			TotalBins[field] = NormalK;
			for (binCounter = 1; binCounter <= TotalBins[field]; binCounter++)
			{
				Breaks[field,binCounter] = Min[field] + (binCounter - 1) * (Max[field] - Min[field])/TotalBins[field];
				Counts[field,binCounter] = 0;
			}
		}
	}
}

#IMPORTANT: less than min and greater than max bin ranges should be handled such that they change max and min
#NOTE: don't just create a new bin on the left and right by adjusting Min[field] and Max[field]. The breaks are determined using Min[field] and Max[field]

function findBin(field,value,inputBreaks,inputTotalBins)
{
	#this returns a bin number that contains the value passed to the function
	if (value <= inputBreaks[field,1])
		return 1;
	else if (value >= inputBreaks[field,inputTotalBins[field]])
		return inputTotalBins[field];
	else
	{
		#find the correct bin using bin chop algorithm
		leftBin = 1;
		rightBin = inputTotalBins[field];
		while (leftBin < rightBin)
		{
			middleBin = int((leftBin + rightBin) / 2);
			if (value > inputBreaks[field,middleBin])
				leftBin = middleBin + 1;
			else if (value < inputBreaks[field,middleBin])
				rightBin = middleBin;
			else
				return middleBin;
		}
		return leftBin - 1;
	}
}

function fillBins()
{
	for (instance = 1; instance <= totalInstances; instance++)
	{
		for (field = 1; field <= TotalFields; field++)
		{
			if (numericFields[field] == 1 && Data[instance,field] != NotKnown)
			{
				#increase the right bin's count
				bin = findBin(field,Data[instance,field],Breaks,TotalBins);
				Counts[field,bin]++;

				#if the count it too high (measure according the equation below), split the bin and make adjustments
#IMPORTANT: is this measure using instances in the bottom of both sides? If so, they cancel out! anny other measure?
				if (Counts[field,bin] > TooMuch*TotalBins[field])
				{
					#make a break point in the middle of the current bin (upper bound is the same as lower bound of the next bin if exists or Max)
					if (bin == TotalBins[field])
						newBreakPoint = (Breaks[field,bin] + Max[field])/2;
					else
						newBreakPoint = (Breaks[field,bin] + Breaks[field,bin+1]) / 2;

					#shift the bins one up
					for (binCounter = TotalBins[field]; binCounter > bin; binCounter--)
					{
						Breaks[field,binCounter+1] = Breaks[field,binCounter];
						Counts[field,binCounter+1] = Counts[field,binCounter];
					}
					#make adjustments (simply divide the number of counts between the new bin and the old bin)
					TotalBins[field]++;
					Breaks[field,bin+1] = newBreakPoint;
					Counts[field,bin+1] = Counts[field,bin] / 2;
					Counts[field,bin] = Counts[field,bin] / 2;
				}
			}
		}
	}
}

function chunkBins()
{
	#this puts the normal bins into chunked bins so that they can be categorized into a smaller number of bins
	for (field = 1; field <= TotalFields; field++)
	{
		if (numericFields[field] == 1)
		{
			newBinCounter = 1;
			newBreaks[field,1] = Breaks[field,1];
			newCounts[field,1] = Counts[field,1];
			for (binCounter = 2; binCounter <= TotalBins[field]; binCounter++)
			{
				#the limit for bin count is specified here. if it is greater than the limit, it will be placed in the next bin
				if (newCounts[field,newBinCounter] + Counts[field,binCounter] <= totalInstances / ChunkK)
					newCounts[field,newBinCounter] += Counts[field,binCounter];
				else
				{
					newBinCounter++;
					newBreaks[field,newBinCounter] = Breaks[field,binCounter];
					newCounts[field,newBinCounter] = Counts[field,binCounter];
				}
			}
			newTotalBins[field] = newBinCounter;
		}
	}
}

function findBestBins()
{
	#take the class field and take its top N bins's Breaks' lower bound as the lower bound for being in the best class value
	classField = TotalFields;
	bestBreak = newBreaks[classField,newTotalBins[classField]-TopBins+1];
	for (instance = 1; instance <= totalInstances; instance++)
	{
		#if the class value is above the best break, each field's bin in the whole instance should be in the best bin list
		if (Data[instance,classField] >= bestBreak)
		{
			best++;
			for (field = 1; field <= TotalFields; field++)
			{
				if (numericFields[field] == 1 && Data[instance,field] != NotKnown)
				{
					bin = findBin(field,Data[instance,field],newBreaks,newTotalBins);
					bestBins[field,bin]++;
				}
			}
		}
	}
}

function curveBestBins()
{
	srand(Seed);
	randomRatio = rand();

	tempCounter = 0;
	#sum up the counts in all best bins
	for (field = 1; field <= TotalFields-1; field++)
	{
		if (numericFields[field] == 1)
		{
			for (binCounter = 1; binCounter <= newTotalBins[field]; binCounter++)
			{
				tempCounter++;
				tempArray[tempCounter,1] = bestBins[field,binCounter];
				tempArray[tempCounter,2] = field;
				tempArray[tempCounter,3] = binCounter;
			}
		}
	}
	tempArraySize = tempCounter;

	#this is a very slow sort. Needs to be optimized later
	for (i = 1; i <= tempArraySize; i++)
	{
		tempCount = tempArray[i,1];
		tempField = tempArray[i,2];
		tempBinCounter = tempArray[i,3];
		j = i;

		while ((j > 1) && (tempArray[j-1,1] > tempCount))
		{
			tempArray[j,1] = tempArray[j-1,1];
			tempArray[j,2] = tempArray[j-1,2];
			tempArray[j,3] = tempArray[j-1,3];
			j = j - 1;
		}
		tempArray[j,1] = tempCount;
		tempArray[j,2] = tempField;
		tempArray[j,3] = tempBinCounter;
	}

	#generate the cumulative sum for each one
	for (tempCounter = 1; tempCounter <= tempArraySize; tempCounter++)
	{
		sumArray[tempCounter,1] = sumArray[tempCounter-1,1] + tempArray[tempCounter,1];
		sumArray[tempCounter,2] = tempArray[tempCounter,2];
		sumArray[tempCounter,3] = tempArray[tempCounter,3];
	}

	#this is used for reporting the results of best ranges of each field
	for (field = 1; field <= TotalFields-1; field++)
	{
		selectedValue[field] = "?";
		seenField[field] = 0;
	}

	#in this part, it selects the best range of each field (if above the threshold) or leave it as "?" to fill it in with the model later
	MinSumValue = sumArray[1,1];
	MaxSumValue = sumArray[tempArraySize,1];
	for (tempCounter = tempArraySize; tempCounter >= 1; tempCounter--)
	{
		if (sumArray[tempCounter,1] >= MinSumValue + (MaxSumValue - MinSumValue) * randomRatio && seenField[sumArray[tempCounter,2]]==0)
		{
			#only choose the top bin for each field and not the rest, even though it is above the threshold
			seenField[sumArray[tempCounter,2]] = 1;
			field = sumArray[tempCounter,2];
			bin = sumArray[tempCounter,3];
			
			lowerBound = newBreaks[field,bin];
			if (bin == newTotalBins[field])
				upperBound = Max[field];
			else
				upperBound = newBreaks[field,bin+1];

			srand(++Seed);
			randomRange = rand();
			selectedValue[field] = lowerBound + (upperBound - lowerBound) * randomRange;
		}
	}
#	for (field = 1; field <= TotalFields-2; field++)
#		printf ("%s,",selectedValue[field]);
#	print selectedValue[field];
}