#!/usr/bin/gawk -f ######################################################################## # dynamic locomo : row pruning method using nearest neighbor and local calibration # Copyright (C) 2007 Omid Jalali # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. ######################################################################## # This is a row pruning method. # It uses a train file to find the best neighborhood for the train instances (and uses LC to do the learning and find the estimates). # It then uses the best neighborhood, and for each test instance, it applies the best neighborhood to the train file and uses LC to find the estimate for each. 
BEGIN{ FS=OFS=","; Ignore="?"; TrainRecordsRead = 0; TestRecordsRead = 0; DoneOnce = 0; MinMreValue = 10^20; MinNeighborhood=0; E = 2.71828182846; } #using the train file, it will first find the MAX and MIN values in it FILENAME==ARGV[1]{ #initialize the max and min with the first record if (NR==1) { TotalFields=NF; for (fieldCounter=1; fieldCounter<=TotalFields; fieldCounter++) { TrainInput[NR,fieldCounter]=$fieldCounter; TrainMax[fieldCounter]=$fieldCounter; TrainMin[fieldCounter]=$fieldCounter; } } else { for (fieldCounter=1; fieldCounter<=TotalFields; fieldCounter++) { TrainInput[NR,fieldCounter]=$fieldCounter; if ($fieldCounter!=Ignore && TrainMax[fieldCounter]<$fieldCounter) TrainMax[fieldCounter]=$fieldCounter; if ($fieldCounter!=Ignore && TrainMin[fieldCounter]>$fieldCounter) TrainMin[fieldCounter]=$fieldCounter; } } TrainRecordsRead++; } #determining distances using the train file only function calculateTrainDistance(){ #find the distance of the core instance from all others (and not itself) for (trainCoreCounter=1; trainCoreCounter<=TrainRecordsRead; trainCoreCounter++) { for (trainInstanceCounter=1; trainInstanceCounter<=TrainRecordsRead; trainInstanceCounter++) { TrainDistance[trainCoreCounter,trainInstanceCounter] = 0; if (trainCoreCounter!=trainInstanceCounter) { for (fieldCounter=1; fieldCounter<=TotalFields; fieldCounter++) { if (TrainInput[trainInstanceCounter,fieldCounter]!=Ignore && TrainInput[trainCoreCounter,fieldCounter]!=Ignore) TrainDistance[trainCoreCounter,trainInstanceCounter]+=( (TrainInput[trainInstanceCounter,fieldCounter]-TrainInput[trainCoreCounter,fieldCounter]) / (TrainMax[fieldCounter]-TrainMin[fieldCounter]+10^(-20)) )^2; } } } } } #sort each core instance's distances and find the best neighborhood that minimized the median of mre's function findTrainMre(){ for (trainCoreCounter=1; trainCoreCounter<=TrainRecordsRead; trainCoreCounter++) { #prepare for sorting the array for each core instance for (trainInstanceCounter=1; 
trainInstanceCounter<=TrainRecordsRead; trainInstanceCounter++) { sortedTrainDistanceArray[trainInstanceCounter,1] = TrainDistance[trainCoreCounter,trainInstanceCounter]; sortedTrainDistanceArray[trainInstanceCounter,2] = TrainInput[trainInstanceCounter,NF]; sortedTrainDistanceArray[trainInstanceCounter,3] = trainInstanceCounter; } #sorting the array using insertion sort for (i = 1; i <= TrainRecordsRead; i++) { tempDistance = sortedTrainDistanceArray[i,1]; tempEffort = sortedTrainDistanceArray[i,2]; tempOriginalCounter = sortedTrainDistanceArray[i,3]; j = i; while ((j > 1) && (sortedTrainDistanceArray[j-1,1] > tempDistance)) { sortedTrainDistanceArray[j,1] = sortedTrainDistanceArray[j-1,1]; sortedTrainDistanceArray[j,2] = sortedTrainDistanceArray[j-1,2]; sortedTrainDistanceArray[j,3] = sortedTrainDistanceArray[j-1,3]; j = j - 1; } sortedTrainDistanceArray[j,1] = tempDistance; sortedTrainDistanceArray[j,2] = tempEffort; sortedTrainDistanceArray[j,3] = tempOriginalCounter; } #try every neighborhood bounded by the number of records in the training file for (neighborhoodCounter=2; neighborhoodCounter<=TrainRecordsRead; neighborhoodCounter++) { TestingCount = 0; TrainingCount = 0; Sum1 = 0; Sum2 = 0; Sum3 = 0; Sum4 = 0; #take the closest neighbor instances and run LC on them for (counter = 2; counter <= neighborhoodCounter; counter++) { #to be able to use the values of each field, the original counter should be used as the instance counter originalCounter = sortedTrainDistanceArray[counter,3]; #LC algorithm pass 1 on training data # Initialize the effort multiplier. EffortMultiplier = 1; # For the fields after the scalar factors and before the last two... for(I=1; I<=(NF-2); I++) EffortMultiplier *= TrainInput[originalCounter,I]; # Get the count for lines of code in the thousands. Kloc = TrainInput[originalCounter,NF-1]; # Get the actual recorded amount of effort in worker months. 
ActualEffort = TrainInput[originalCounter,NF]; #run LC on the nearest instances to find the estimate TrainingCount++; # the following test assures that the kloc is not repeating. In some cases, repeating klocs result in a division by zero in A's calculation. if (arrayKlocTrain[Kloc] == 1) Kloc += 10^(-20); arrayKlocTrain[Kloc] = 1; #calculate the sum values Sum1 += log(Kloc); Sum2 += (log(Kloc) * log(Kloc)); Sum3 += log (ActualEffort / EffortMultiplier); Sum4 += (log(Kloc) * log (ActualEffort / EffortMultiplier)); } #LC algorithm pass 2 on the test instance (taken from the train file) # Initialize the effort multiplier. EffortMultiplier = 1; # For the fields after the scalar factors and before the last two... for(I=1; I<=(NF-2); I++) EffortMultiplier *= TrainInput[trainCoreCounter,I]; # Get the count for lines of code in the thousands. Kloc = TrainInput[trainCoreCounter,NF-1]; # Get the actual recorded amount of effort in worker months. ActualEffort = TrainInput[trainCoreCounter,NF]; TestingCount++; if(TrainingCount>1) { # Immediately on entering Pass 2 calculate the A and B values from the information retained from Pass 1 if (TrainingCount*Sum2 - Sum1*Sum1 == 0) { A = ( Sum2*Sum3 - Sum1*Sum4 ) / ( TrainingCount*Sum2 - Sum1*Sum1 + 10^(-20)); B = ( TrainingCount*Sum4 - Sum1*Sum3 ) / ( TrainingCount*Sum2 - Sum1*Sum1 + 10^(-20)); } else { A = ( Sum2*Sum3 - Sum1*Sum4 ) / ( TrainingCount*Sum2 - Sum1*Sum1 ); B = ( TrainingCount*Sum4 - Sum1*Sum3 ) / ( TrainingCount*Sum2 - Sum1*Sum1 ); } } ScaleFactor = 0; # Calculate the estimated effort using Boehm's Cocomo formula and then calculate some statistics EstimatedEffort = ((B + ScaleFactor) * log(Kloc)) + A + log(EffortMultiplier); EstimatedEffort = E ^ EstimatedEffort; #it is important to note that neighborhood is actually neighborhoodCounter - 1 since neighborhoodCounter = 1 results in a neighborhood of 0 #which is not wanted here (hence the counters start from 2 in these loops). 
neighborhood = neighborhoodCounter - 1; estimate = EstimatedEffort; #actual value is the value from Input array in the last field actual = TrainInput[trainCoreCounter,NF]; #find the mre mre = (actual-estimate)/actual; mre = (mre < 0 ? -1*mre : mre); count[neighborhood]++; totalMre[neighborhood,count[neighborhood]] = mre; } } for (neighborhoodCounter=1; neighborhoodCounter<=TrainRecordsRead; neighborhoodCounter++) { #reseting the old values to 0; for (i = 1; i <= count[neighborhoodCounter-1]; i++) tempNeighborhoodArray[i] = 0; #preparing the array for sorting to find the median of mre's for (i = 1; i <= count[neighborhoodCounter]; i++) tempNeighborhoodArray[i] = totalMre[neighborhoodCounter,i]; #sorting the array asort(tempNeighborhoodArray); #finding the median mre if (count[neighborhoodCounter] % 2 == 0) medianCount = count[neighborhoodCounter] / 2; else medianCount = (count[neighborhoodCounter] / 2) + 0.5; medianMre[neighborhoodCounter] = tempNeighborhoodArray[medianCount]; } } #now that the training is done, it will do the testing with the instances in the second file FILENAME==ARGV[2]{ if (DoneOnce!=1) { calculateTrainDistance(); findTrainMre(); for (neighborhoodCounter=1; neighborhoodCounter<=TrainRecordsRead-1; neighborhoodCounter++) { if (medianMre[neighborhoodCounter] < MinMreValue) { MinMreValue = medianMre[neighborhoodCounter]; MinNeighborhood = neighborhoodCounter; } } DoneOnce = 1; #reset NR NR = 1; } #read each test instance and determine the overall min and max for each test instances (individually) and all train instances (already determined) for (fieldCounter=1; fieldCounter<=TotalFields; fieldCounter++) { TestInput[NR,fieldCounter]=$fieldCounter; if (TrainMax[fieldCounter] > $fieldCounter) TestMax[NR,fieldCounter] = TrainMax[fieldCounter]; else TestMax[NR,fieldCounter] = $fieldCounter; if (TrainMin[fieldCounter] < $fieldCounter) TestMin[NR,fieldCounter] = TrainMin[fieldCounter]; else TestMin[NR,fieldCounter] = $fieldCounter; } TestRecordsRead++; 
#the MinNeighborhood was determined and can be used } #using the instances in the train file, each test instance gets its distances calculated function calculateTestDistance(){ #find the distance of each test instance from all train instances for (testInstanceCounter=1; testInstanceCounter<=TestRecordsRead; testInstanceCounter++) { for (trainInstanceCounter=1; trainInstanceCounter<=TrainRecordsRead; trainInstanceCounter++) { TestDistance[testInstanceCounter,trainInstanceCounter] = 0; #it does not use the actual effort since it should not be known in testing for (fieldCounter=1; fieldCounter<=TotalFields-1; fieldCounter++) { if (TrainInput[trainInstanceCounter,fieldCounter]!=Ignore && TestInput[testInstanceCounter,fieldCounter]!=Ignore) TestDistance[testInstanceCounter,trainInstanceCounter]+=( (TrainInput[trainInstanceCounter,fieldCounter]-TestInput[testInstanceCounter,fieldCounter]) / (TestMax[testInstanceCounter,fieldCounter]-TestMin[testInstanceCounter,fieldCounter]+10^(-20)) )^2; } } } } #sort each core instance's distances and find the best neighborhood that minimized the sum of mre's function findTestEstimate(){ for (testInstanceCounter=1; testInstanceCounter<=TestRecordsRead; testInstanceCounter++) { #prepare for sorting the array for each core instance for (trainInstanceCounter=1; trainInstanceCounter<=TrainRecordsRead; trainInstanceCounter++) { sortedTestDistanceArray[trainInstanceCounter,1] = TestDistance[testInstanceCounter,trainInstanceCounter]; sortedTestDistanceArray[trainInstanceCounter,2] = TrainInput[trainInstanceCounter,NF]; sortedTestDistanceArray[trainInstanceCounter,3] = trainInstanceCounter; } #sorting the array using insertion sort for (i = 1; i <= TrainRecordsRead; i++) { tempDistance = sortedTestDistanceArray[i,1]; tempEffort = sortedTestDistanceArray[i,2]; tempOriginalCounter = sortedTestDistanceArray[i,3]; j = i; while ((j > 1) && (sortedTestDistanceArray[j-1,1] > tempDistance)) { sortedTestDistanceArray[j,1] = 
sortedTestDistanceArray[j-1,1]; sortedTestDistanceArray[j,2] = sortedTestDistanceArray[j-1,2]; sortedTestDistanceArray[j,3] = sortedTestDistanceArray[j-1,3]; j = j - 1; } sortedTestDistanceArray[j,1] = tempDistance; sortedTestDistanceArray[j,2] = tempEffort; sortedTestDistanceArray[j,3] = tempOriginalCounter; } #print the first MinNeighborhoods' estimate using LC from the sorted array TestingCount = 0; TrainingCount = 0; Sum1 = 0; Sum2 = 0; Sum3 = 0; Sum4 = 0; #take the closest neighbor instances and run LC on them for (counter = 2; counter <= MinNeighborhood; counter++) { #to be able to use the values of each field, the original counter should be used as the instance counter originalCounter = sortedTestDistanceArray[counter,3]; #LC algorithm pass 1 on training data # Initialize the effort multiplier. EffortMultiplier = 1; # For the fields after the scalar factors and before the last two... for(I=1; I<=(NF-2); I++) EffortMultiplier *= TrainInput[originalCounter,I]; # Get the count for lines of code in the thousands. Kloc = TrainInput[originalCounter,NF-1]; # Get the actual recorded amount of effort in worker months. ActualEffort = TrainInput[originalCounter,NF]; #run LC on the nearest instances to find the estimate TrainingCount++; # the following test assures that the kloc is not repeating. In some cases, repeating klocs result in a division by zero in A's calculation. if (arrayKlocTest[Kloc] == 1) Kloc += 10^(-20); arrayKlocTest[Kloc] = 1; #calculate the sum values Sum1 += log(Kloc); Sum2 += (log(Kloc) * log(Kloc)); Sum3 += log (ActualEffort / EffortMultiplier); Sum4 += (log(Kloc) * log (ActualEffort / EffortMultiplier)); } #LC algorithm pass 2 on the test instance (taken from the test file) # Initialize the effort multiplier. EffortMultiplier = 1; # For the fields after the scalar factors and before the last two... for(I=1; I<=(NF-2); I++) EffortMultiplier *= TestInput[testInstanceCounter,I]; # Get the count for lines of code in the thousands. 
Kloc = TestInput[testInstanceCounter,NF-1]; # Get the actual recorded amount of effort in worker months. ActualEffort = TestInput[testInstanceCounter,NF]; TestingCount++; if(TrainingCount>1) { # Immediately on entering Pass 2 calculate the A and B values from the information retained from Pass 1 if (TrainingCount*Sum2 - Sum1*Sum1 == 0) { A = ( Sum2*Sum3 - Sum1*Sum4 ) / ( TrainingCount*Sum2 - Sum1*Sum1 + 10^(-20)); B = ( TrainingCount*Sum4 - Sum1*Sum3 ) / ( TrainingCount*Sum2 - Sum1*Sum1 + 10^(-20)); } else { A = ( Sum2*Sum3 - Sum1*Sum4 ) / ( TrainingCount*Sum2 - Sum1*Sum1 ); B = ( TrainingCount*Sum4 - Sum1*Sum3 ) / ( TrainingCount*Sum2 - Sum1*Sum1 ); } } ScaleFactor = 0; # Calculate the estimated effort using Boehm's Cocomo formula and then calculate some statistics EstimatedEffort = ((B + ScaleFactor) * log(Kloc)) + A + log(EffortMultiplier); EstimatedEffort = E ^ EstimatedEffort; estimate = EstimatedEffort; #actual value is the value from Input array in the last field actual = TestInput[testInstanceCounter,NF]; print estimate, actual, MinNeighborhood; } } END{ calculateTestDistance(); findTestEstimate(); }