#!/usr/bin/gawk -f

#This is a row pruning method. 
#It uses a train file to find the best neighborhood for the train instances (and uses LC to do the learning and find the estimates).
#It then uses the best neighborhood, and for each test instance, it applies the best neighborhood to the train file and uses LC to find the estimate for each.

BEGIN{
	FS=OFS=",";
	Ignore="?";
	TrainRecordsRead = 0;
	TestRecordsRead = 0;
	DoneOnce = 0;
	MinMreValue = 10^20;
	MinNeighborhood=0;
	E = 2.71828182846;
}

#using the train file, it will first find the MAX and MIN values in it
FILENAME==ARGV[1]{
	#initialize the max and min with the first record
	if (NR==1)
	{
		TotalFields=NF;
		for (fieldCounter=1; fieldCounter<=TotalFields; fieldCounter++)
		{
			TrainInput[NR,fieldCounter]=$fieldCounter;
			TrainMax[fieldCounter]=$fieldCounter;
			TrainMin[fieldCounter]=$fieldCounter;
		}
	}
	else
	{
		for (fieldCounter=1; fieldCounter<=TotalFields; fieldCounter++)
		{
			TrainInput[NR,fieldCounter]=$fieldCounter;
			if ($fieldCounter!=Ignore && TrainMax[fieldCounter]<$fieldCounter)
				TrainMax[fieldCounter]=$fieldCounter;
			if ($fieldCounter!=Ignore && TrainMin[fieldCounter]>$fieldCounter)
				TrainMin[fieldCounter]=$fieldCounter;
		}
	}
	TrainRecordsRead++;
}

#determining distances using the train file only
function calculateTrainDistance(){
	#find the distance of the core instance from all others (and not itself)
	for (trainCoreCounter=1; trainCoreCounter<=TrainRecordsRead; trainCoreCounter++)
	{
		for (trainInstanceCounter=1; trainInstanceCounter<=TrainRecordsRead; trainInstanceCounter++)
		{
			TrainDistance[trainCoreCounter,trainInstanceCounter] = 0;
			if (trainCoreCounter!=trainInstanceCounter)
			{
				for (fieldCounter=1; fieldCounter<=TotalFields; fieldCounter++)
				{
					if (TrainInput[trainInstanceCounter,fieldCounter]!=Ignore && TrainInput[trainCoreCounter,fieldCounter]!=Ignore)
						TrainDistance[trainCoreCounter,trainInstanceCounter]+=( (TrainInput[trainInstanceCounter,fieldCounter]-TrainInput[trainCoreCounter,fieldCounter]) / (TrainMax[fieldCounter]-TrainMin[fieldCounter]+10^(-20)) )^2;
				}
			}
		}
	}
}

#sort each core instance's distances and find the best neighborhood that minimized the mean of mre's
function findTrainMre(){
	for (trainCoreCounter=1; trainCoreCounter<=TrainRecordsRead; trainCoreCounter++)
	{
		#prepare for sorting the array for each core instance
		for (trainInstanceCounter=1; trainInstanceCounter<=TrainRecordsRead; trainInstanceCounter++)
		{
			sortedTrainDistanceArray[trainInstanceCounter,1] = TrainDistance[trainCoreCounter,trainInstanceCounter];
			sortedTrainDistanceArray[trainInstanceCounter,2] = TrainInput[trainInstanceCounter,NF];
			sortedTrainDistanceArray[trainInstanceCounter,3] = trainInstanceCounter;
		}

		#sorting the array using insertion sort
		for (i = 1; i <= TrainRecordsRead; i++)
		{
			tempDistance = sortedTrainDistanceArray[i,1];
			tempEffort = sortedTrainDistanceArray[i,2];
			tempOriginalCounter = sortedTrainDistanceArray[i,3];
			j = i;
			
			while ((j > 1) && (sortedTrainDistanceArray[j-1,1] > tempDistance))
			{
				sortedTrainDistanceArray[j,1] = sortedTrainDistanceArray[j-1,1];
				sortedTrainDistanceArray[j,2] = sortedTrainDistanceArray[j-1,2];
				sortedTrainDistanceArray[j,3] = sortedTrainDistanceArray[j-1,3];
				j = j - 1;
			}
			sortedTrainDistanceArray[j,1] = tempDistance;
			sortedTrainDistanceArray[j,2] = tempEffort;
			sortedTrainDistanceArray[j,3] = tempOriginalCounter;
		}

		#try every neighborhood bounded by the number of records in the training file
		for (neighborhoodCounter=2; neighborhoodCounter<=TrainRecordsRead; neighborhoodCounter++)
		{
			TestingCount = 0;
			TrainingCount = 0;
			Sum1 = 0;
			Sum2 = 0;
			Sum3 = 0;
			Sum4 = 0;
		
			#take the closest neighbor instances and run LC on them
			for (counter = 2; counter <= neighborhoodCounter; counter++)
			{
				#to be able to use the values of each field, the original counter should be used as the instance counter
				originalCounter = sortedTrainDistanceArray[counter,3];

				#LC algorithm pass 1 on training data

				# Initialize the effort multiplier.
				EffortMultiplier = 1;
			
				# For the fields after the scalar factors and before the last two...
				for(I=1; I<=(NF-2); I++)
					EffortMultiplier *= TrainInput[originalCounter,I];
			
				# Get the count for lines of code in the thousands.
				Kloc = TrainInput[originalCounter,NF-1];	
				# Get the actual recorded amount of effort in worker months.
				ActualEffort = TrainInput[originalCounter,NF];

				#run LC on the nearest instances to find the estimate
				TrainingCount++;

				# the following test assures that the kloc is not repeating. In some cases, repeating klocs result in a division by zero in A's calculation.
				if (arrayKlocTrain[Kloc] == 1)
					Kloc += 10^(-20);
				arrayKlocTrain[Kloc] = 1;
			
				#calculate the sum values	
				Sum1 += log(Kloc);
				Sum2 += (log(Kloc) * log(Kloc));
				Sum3 += log (ActualEffort / EffortMultiplier);
				Sum4 += (log(Kloc) * log (ActualEffort / EffortMultiplier));
			}

			#LC algorithm pass 2 on the test instance (taken from the train file)
			
			# Initialize the effort multiplier.
			EffortMultiplier = 1;
			
			# For the fields after the scalar factors and before the last two...
			for(I=1; I<=(NF-2); I++)
				EffortMultiplier *= TrainInput[trainCoreCounter,I];
			
			# Get the count for lines of code in the thousands.
			Kloc = TrainInput[trainCoreCounter,NF-1];	
			# Get the actual recorded amount of effort in worker months.
			ActualEffort = TrainInput[trainCoreCounter,NF];

			TestingCount++;
			if(TrainingCount>1)
			{
				# Immediately on entering Pass 2 calculate the A and B values from the information retained from Pass 1
			 	if (TrainingCount*Sum2 - Sum1*Sum1 == 0)
				{	
					A = ( Sum2*Sum3  - Sum1*Sum4 ) / ( TrainingCount*Sum2 - Sum1*Sum1 + 10^(-20));
					B = ( TrainingCount*Sum4 - Sum1*Sum3 ) / ( TrainingCount*Sum2 - Sum1*Sum1 + 10^(-20));
				}
				else
				{
					A = ( Sum2*Sum3  - Sum1*Sum4 ) / ( TrainingCount*Sum2 - Sum1*Sum1 );
					B = ( TrainingCount*Sum4 - Sum1*Sum3 ) / ( TrainingCount*Sum2 - Sum1*Sum1 );
				}
			}
			
			ScaleFactor = 0;

			# Calculate the estimated effort using Boehm's Cocomo formula and then calculate some statistics
			EstimatedEffort = ((B + ScaleFactor) * log(Kloc)) + A + log(EffortMultiplier);
			EstimatedEffort = E ^ EstimatedEffort;
		
			#it is important to note that neighborhood is actually neighborhoodCounter - 1 since neighborhoodCounter = 1 results in a neighborhood of 0 
			#which is not wanted here (hence the counters start from 2 in these loops).
			neighborhood = neighborhoodCounter - 1;

			estimate = EstimatedEffort;

			#actual value is the value from Input array in the last field
			actual = TrainInput[trainCoreCounter,NF];
		
			#find the mre
			mre = (actual-estimate)/actual;
			mre = (mre < 0 ? -1*mre : mre);

			totalMre[neighborhood] += mre;
		}
	}
}

#now that the training is done, it will do the testing with the instances in the second file
FILENAME==ARGV[2]{
	if (DoneOnce!=1)
	{
		calculateTrainDistance();
		findTrainMre();

		for (neighborhoodCounter=1; neighborhoodCounter<=TrainRecordsRead-1; neighborhoodCounter++)
		{
			totalMre[neighborhoodCounter] = totalMre[neighborhoodCounter]/TrainRecordsRead;
			if (totalMre[neighborhoodCounter] < MinMreValue)
			{
				MinMreValue = totalMre[neighborhoodCounter];
				MinNeighborhood = neighborhoodCounter;
			}
		}
		DoneOnce = 1;
		#reset NR
		NR = 1;
	}

	#read each test instance and determine the overall min and max for each test instances (individually) and all train instances (already determined)
	for (fieldCounter=1; fieldCounter<=TotalFields; fieldCounter++)
	{
		TestInput[NR,fieldCounter]=$fieldCounter;
		if (TrainMax[fieldCounter] > $fieldCounter)
			TestMax[NR,fieldCounter] = TrainMax[fieldCounter];
		else
			TestMax[NR,fieldCounter] = $fieldCounter;

		if (TrainMin[fieldCounter] < $fieldCounter)
			TestMin[NR,fieldCounter] = TrainMin[fieldCounter];
		else
			TestMin[NR,fieldCounter] = $fieldCounter;
	}
	TestRecordsRead++;

	#the MinNeighborhood was determined and can be used
}

#using the instances in the train file, each test instance gets its distances calculated
function calculateTestDistance(){
	#find the distance of each test instance from all train instances
	for (testInstanceCounter=1; testInstanceCounter<=TestRecordsRead; testInstanceCounter++)
	{
		for (trainInstanceCounter=1; trainInstanceCounter<=TrainRecordsRead; trainInstanceCounter++)
		{
			TestDistance[testInstanceCounter,trainInstanceCounter] = 0;
			#it does not use the actual effort since it should not be known in testing
			for (fieldCounter=1; fieldCounter<=TotalFields-1; fieldCounter++)
			{
				if (TrainInput[trainInstanceCounter,fieldCounter]!=Ignore && TestInput[testInstanceCounter,fieldCounter]!=Ignore)
					TestDistance[testInstanceCounter,trainInstanceCounter]+=( (TrainInput[trainInstanceCounter,fieldCounter]-TestInput[testInstanceCounter,fieldCounter]) / (TestMax[testInstanceCounter,fieldCounter]-TestMin[testInstanceCounter,fieldCounter]+10^(-20)) )^2;
			}
		}
	}
}

#sort each core instance's distances and find the best neighborhood that minimized the sum of mre's
function findTestEstimate(){
	for (testInstanceCounter=1; testInstanceCounter<=TestRecordsRead; testInstanceCounter++)
	{
		#prepare for sorting the array for each core instance
		for (trainInstanceCounter=1; trainInstanceCounter<=TrainRecordsRead; trainInstanceCounter++)
		{
			sortedTestDistanceArray[trainInstanceCounter,1] = TestDistance[testInstanceCounter,trainInstanceCounter];
			sortedTestDistanceArray[trainInstanceCounter,2] = TrainInput[trainInstanceCounter,NF];
			sortedTestDistanceArray[trainInstanceCounter,3] = trainInstanceCounter;
		}

		#sorting the array using insertion sort
		for (i = 1; i <= TrainRecordsRead; i++)
		{
			tempDistance = sortedTestDistanceArray[i,1];
			tempEffort = sortedTestDistanceArray[i,2];
			tempOriginalCounter = sortedTestDistanceArray[i,3];
			j = i;
			
			while ((j > 1) && (sortedTestDistanceArray[j-1,1] > tempDistance))
			{
				sortedTestDistanceArray[j,1] = sortedTestDistanceArray[j-1,1];
				sortedTestDistanceArray[j,2] = sortedTestDistanceArray[j-1,2];
				sortedTestDistanceArray[j,3] = sortedTestDistanceArray[j-1,3];
				j = j - 1;
			}
			sortedTestDistanceArray[j,1] = tempDistance;
			sortedTestDistanceArray[j,2] = tempEffort;
			sortedTestDistanceArray[j,3] = tempOriginalCounter;
		}

		#print the first MinNeighborhoods' estimate using LC from the sorted array 
		TestingCount = 0;
		TrainingCount = 0;
		Sum1 = 0;
		Sum2 = 0;
		Sum3 = 0;
		Sum4 = 0;
		
		#take the closest neighbor instances and run LC on them
		for (counter = 2; counter <= MinNeighborhood; counter++)
		{
			#to be able to use the values of each field, the original counter should be used as the instance counter
			originalCounter = sortedTestDistanceArray[counter,3];

			#LC algorithm pass 1 on training data

			# Initialize the effort multiplier.
			EffortMultiplier = 1;

			# For the fields after the scalar factors and before the last two...
			for(I=1; I<=(NF-2); I++)
				EffortMultiplier *= TrainInput[originalCounter,I];

			# Get the count for lines of code in the thousands.
			Kloc = TrainInput[originalCounter,NF-1];	
			# Get the actual recorded amount of effort in worker months.
			ActualEffort = TrainInput[originalCounter,NF];

			#run LC on the nearest instances to find the estimate
			TrainingCount++;

			# the following test assures that the kloc is not repeating. In some cases, repeating klocs result in a division by zero in A's calculation.
			if (arrayKlocTest[Kloc] == 1)
				Kloc += 10^(-20);
			arrayKlocTest[Kloc] = 1;

			#calculate the sum values	
			Sum1 += log(Kloc);
			Sum2 += (log(Kloc) * log(Kloc));
			Sum3 += log (ActualEffort / EffortMultiplier);
			Sum4 += (log(Kloc) * log (ActualEffort / EffortMultiplier));
		}

		#LC algorithm pass 2 on the test instance (taken from the test file)

		# Initialize the effort multiplier.
		EffortMultiplier = 1;

		# For the fields after the scalar factors and before the last two...
		for(I=1; I<=(NF-2); I++)
			EffortMultiplier *= TestInput[testInstanceCounter,I];

		# Get the count for lines of code in the thousands.
		Kloc = TestInput[testInstanceCounter,NF-1];	
		# Get the actual recorded amount of effort in worker months.
		ActualEffort = TestInput[testInstanceCounter,NF];

		TestingCount++;
		if(TrainingCount>1)
		{
			# Immediately on entering Pass 2 calculate the A and B values from the information retained from Pass 1
			if (TrainingCount*Sum2 - Sum1*Sum1 == 0)
			{	
				A = ( Sum2*Sum3  - Sum1*Sum4 ) / ( TrainingCount*Sum2 - Sum1*Sum1 + 10^(-20));
				B = ( TrainingCount*Sum4 - Sum1*Sum3 ) / ( TrainingCount*Sum2 - Sum1*Sum1 + 10^(-20));
			}
			else
			{
				A = ( Sum2*Sum3  - Sum1*Sum4 ) / ( TrainingCount*Sum2 - Sum1*Sum1 );
				B = ( TrainingCount*Sum4 - Sum1*Sum3 ) / ( TrainingCount*Sum2 - Sum1*Sum1 );
			}
		}

		ScaleFactor = 0;

		# Calculate the estimated effort using Boehm's Cocomo formula and then calculate some statistics
		EstimatedEffort = ((B + ScaleFactor) * log(Kloc)) + A + log(EffortMultiplier);
		EstimatedEffort = E ^ EstimatedEffort;

		estimate = EstimatedEffort;

		#actual value is the value from Input array in the last field
		actual = TestInput[testInstanceCounter,NF];

		print estimate, actual, MinNeighborhood;
	}
}

END{
	calculateTestDistance();
	findTestEstimate();
}