#!/usr/bin/gawk -f

#this is based on training using MRE

BEGIN{
	FS=OFS=",";
	Ignore="?";
	TrainRecordsRead = 0;
	TestRecordsRead = 0;
	DoneOnce = 0;
	MinMreValue = 10^20;
	MinNeighborhood=0;
}

#using the train file, it will first find the MAX and MIN values in it
FILENAME==ARGV[1]{
	#initialize the max and min with the first record
	if (NR==1)
	{
		TotalFields=NF;
		for (fieldCounter=1; fieldCounter<=TotalFields; fieldCounter++)
		{
			TrainInput[NR,fieldCounter]=$fieldCounter;
			TrainMax[fieldCounter]=$fieldCounter;
			TrainMin[fieldCounter]=$fieldCounter;
		}
	}
	else
	{
		for (fieldCounter=1; fieldCounter<=TotalFields; fieldCounter++)
		{
			TrainInput[NR,fieldCounter]=$fieldCounter;
			if ($fieldCounter!=Ignore && TrainMax[fieldCounter]<$fieldCounter)
				TrainMax[fieldCounter]=$fieldCounter;
			if ($fieldCounter!=Ignore && TrainMin[fieldCounter]>$fieldCounter)
				TrainMin[fieldCounter]=$fieldCounter;
		}
	}
	TrainRecordsRead++;
}

#determining distances using the train file only
function calculateTrainDistance(){
	#find the distance of the core instance from all others (and not itself)
	for (trainCoreCounter=1; trainCoreCounter<=TrainRecordsRead; trainCoreCounter++)
	{
		for (trainInstanceCounter=1; trainInstanceCounter<=TrainRecordsRead; trainInstanceCounter++)
		{
			TrainDistance[trainCoreCounter,trainInstanceCounter] = 0;
			if (trainCoreCounter!=trainInstanceCounter)
			{
				for (fieldCounter=1; fieldCounter<=TotalFields; fieldCounter++)
				{
					if (TrainInput[trainInstanceCounter,fieldCounter]!=Ignore && TrainInput[trainCoreCounter,fieldCounter]!=Ignore)
						TrainDistance[trainCoreCounter,trainInstanceCounter]+=( (TrainInput[trainInstanceCounter,fieldCounter]-TrainInput[trainCoreCounter,fieldCounter]) / (TrainMax[fieldCounter]-TrainMin[fieldCounter]+10^(-20)) )^2;
				}
			}
		}
	}
}

#using the instances in the train file, each test instance gets its distances calculated
function calculateTestDistance(){
	#find the distance of each test instance from all train instances
	for (testInstanceCounter=1; testInstanceCounter<=TestRecordsRead; testInstanceCounter++)
	{
		for (trainInstanceCounter=1; trainInstanceCounter<=TrainRecordsRead; trainInstanceCounter++)
		{
			TestDistance[testInstanceCounter,trainInstanceCounter] = 0;
			#it does not use the actual effort since it should not be known in testing
			for (fieldCounter=1; fieldCounter<=TotalFields-1; fieldCounter++)
			{
				if (TrainInput[trainInstanceCounter,fieldCounter]!=Ignore && TestInput[testInstanceCounter,fieldCounter]!=Ignore)
					TestDistance[testInstanceCounter,trainInstanceCounter]+=( (TrainInput[trainInstanceCounter,fieldCounter]-TestInput[testInstanceCounter,fieldCounter]) / (TestMax[testInstanceCounter,fieldCounter]-TestMin[testInstanceCounter,fieldCounter]+10^(-20)) )^2;
			}
		}
	}
}

#sort each core instance's distances and find the best neighborhood that minimized the sum of mre's
function findTrainMre(){
	for (trainCoreCounter=1; trainCoreCounter<=TrainRecordsRead; trainCoreCounter++)
	{
		#prepare for sorting the array for each core instance
		for (trainInstanceCounter=1; trainInstanceCounter<=TrainRecordsRead; trainInstanceCounter++)
		{
			sortedTrainDistanceArray[trainInstanceCounter,1] = TrainDistance[trainCoreCounter,trainInstanceCounter];
			sortedTrainDistanceArray[trainInstanceCounter,2] = TrainInput[trainInstanceCounter,NF];
		}

		#sorting the array using insertion sort
		for (i = 1; i <= TrainRecordsRead; i++)
		{
			tempDistance = sortedTrainDistanceArray[i,1];
			tempEffort = sortedTrainDistanceArray[i,2];
			j = i;
			#the second part of the while loop condition assures that we sort on tempEffort as well when two or more records have equal distances
			#this allows for selection of smaller effort values when distances are equal in the neighborhood selection
			while ((j > 1) && (sortedTrainDistanceArray[j-1,1] > tempDistance || (sortedTrainDistanceArray[j-1,1]==tempDistance && sortedTrainDistanceArray[j-1,2]>tempEffort)))
#			while ((j > 1) && (sortedTrainDistanceArray[j-1,1] > tempDistance))
			{
				sortedTrainDistanceArray[j,1] = sortedTrainDistanceArray[j-1,1];
				sortedTrainDistanceArray[j,2] = sortedTrainDistanceArray[j-1,2];
				j = j - 1;
			}
			sortedTrainDistanceArray[j,1] = tempDistance;
			sortedTrainDistanceArray[j,2] = tempEffort;
		}

		#try every neighborhood bounded by the number of records in the training file
		for (neighborhoodCounter=2; neighborhoodCounter<=TrainRecordsRead; neighborhoodCounter++)
		{
			sum = 0;
			#take the average of the effort of the closest number of neighbors
			for (counter = 2; counter <= neighborhoodCounter; counter++)
			{
				sum+=sortedTrainDistanceArray[counter,2];
			}
			#it is important to note that neighborhood is actually neighborhoodCounter - 1 since neighborhoodCounter = 1 results in a neighborhood of 0 
			#which is not wanted here (hence the counters start from 2 in these loops).
			neighborhood = neighborhoodCounter - 1;
	
			#now average is the new estimate (and the core itself was counted in the neighborhood but it is not needed in finding the sum)
			estimate = sum/(neighborhood);

			#actual value is the value from Input array in the last field
			actual = TrainInput[trainCoreCounter,NF];
		
			#find the mre
			mre = (actual-estimate)/actual;
			mre = (mre < 0 ? -1*mre : mre);
			totalMre[neighborhood]+=mre;
		}
	}
}

#for each test instance, it uses the MinNeighborhood to find the estimate
function findTestEstimate(){
	for (testInstanceCounter=1; testInstanceCounter<=TestRecordsRead; testInstanceCounter++)
	{
		#prepare for sorting the array for each core instance
		for (trainInstanceCounter=1; trainInstanceCounter<=TrainRecordsRead; trainInstanceCounter++)
		{
			sortedTestDistanceArray[trainInstanceCounter,1] = TestDistance[testInstanceCounter,trainInstanceCounter];
			sortedTestDistanceArray[trainInstanceCounter,2] = TrainInput[trainInstanceCounter,NF];
		}

		#sorting the array using insertion sort
		for (i = 1; i <= TrainRecordsRead; i++)
		{
			tempDistance = sortedTestDistanceArray[i,1];
			tempEffort = sortedTestDistanceArray[i,2];
			j = i;
			#the second part of the while loop condition assures that we sort on tempEffort as well when two or more records have equal distances
			#this allows for selection of smaller effort values when distances are equal in the neighborhood selection
			while ((j > 1) && (sortedTestDistanceArray[j-1,1] > tempDistance || (sortedTestDistanceArray[j-1,1]==tempDistance && sortedTestDistanceArray[j-1,2]>tempEffort)))
#			while ((j > 1) && (sortedTestDistanceArray[j-1,1] > tempDistance))
			{
				sortedTestDistanceArray[j,1] = sortedTestDistanceArray[j-1,1];
				sortedTestDistanceArray[j,2] = sortedTestDistanceArray[j-1,2];
				j = j - 1;
			}
			sortedTestDistanceArray[j,1] = tempDistance;
			sortedTestDistanceArray[j,2] = tempEffort;
		}
	
		#print the first MinNeighborhoods' average from the sorted array 
		sum = 0;
		for (counter = 1; counter <= MinNeighborhood; counter++)
		{
			sum+=sortedTestDistanceArray[counter,2];
		}
		
		estimate = sum/(MinNeighborhood);
		actual = TestInput[testInstanceCounter,NF];
		
		print estimate, actual, MinNeighborhood;	
	}
}

#now that the training is done, it will do the testing with the instances in the second file
FILENAME==ARGV[2]{
	if (DoneOnce!=1)
	{
		calculateTrainDistance();
		findTrainMre();

		for (neighborhoodCounter=1; neighborhoodCounter<=TrainRecordsRead-1; neighborhoodCounter++)
		{
			totalMre[neighborhoodCounter] = totalMre[neighborhoodCounter]/TrainRecordsRead;
			if (totalMre[neighborhoodCounter] < MinMreValue)
			{
				MinMreValue = totalMre[neighborhoodCounter];
				MinNeighborhood = neighborhoodCounter;
			}
		}
		DoneOnce = 1;
		#reset NR
		NR = 1;
	}

	#read each test instance and determine the overall min and max for each test instances (individually) and all train instances (already determined)
	for (fieldCounter=1; fieldCounter<=TotalFields; fieldCounter++)
	{
		TestInput[NR,fieldCounter]=$fieldCounter;
		if (TrainMax[fieldCounter] > $fieldCounter)
			TestMax[NR,fieldCounter] = TrainMax[fieldCounter];
		else
			TestMax[NR,fieldCounter] = $fieldCounter;

		if (TrainMin[fieldCounter] < $fieldCounter)
			TestMin[NR,fieldCounter] = TrainMin[fieldCounter];
		else
			TestMin[NR,fieldCounter] = $fieldCounter;
	}
	TestRecordsRead++;

	#the MinNeighborhood was determined and can be used
}

END{
	calculateTestDistance();
	findTestEstimate();
}