#!/usr/bin/gawk -f ######################################################################## # dynamic locomo : row pruning method using nearest neighbor and local calibration # Copyright (C) 2007 Omid Jalali # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. ######################################################################## # This is a row pruning method. # It uses a train file to find the best neighborhood for the train instances (and uses LC to do the learning and find the estimates). # It then uses the best neighborhood, and for each test instance, it applies the best neighborhood to the train file and uses LC to find the estimate for each. 
BEGIN{ FS=OFS=","; Ignore="?"; TrainRecordsRead = 0; TestRecordsRead = 0; DoneOnce = 0; MinMreValue = 10^20; MinNeighborhood=0; E = 2.71828182846; } #using the train file, it will first find the MAX and MIN values in it FILENAME==ARGV[1]{ #initialize the max and min with the first record if (NR==1) { TotalFields=NF; for (fieldCounter=1; fieldCounter<=TotalFields; fieldCounter++) { TrainInput[NR,fieldCounter]=$fieldCounter; TrainMax[fieldCounter]=$fieldCounter; TrainMin[fieldCounter]=$fieldCounter; } } else { for (fieldCounter=1; fieldCounter<=TotalFields; fieldCounter++) { TrainInput[NR,fieldCounter]=$fieldCounter; if ($fieldCounter!=Ignore && TrainMax[fieldCounter]<$fieldCounter) TrainMax[fieldCounter]=$fieldCounter; if ($fieldCounter!=Ignore && TrainMin[fieldCounter]>$fieldCounter) TrainMin[fieldCounter]=$fieldCounter; } } TrainRecordsRead++; } #determining distances using the train file only function calculateTrainDistance(){ #find the distance of the core instance from all others (and not itself) for (trainCoreCounter=1; trainCoreCounter<=TrainRecordsRead; trainCoreCounter++) { for (trainInstanceCounter=1; trainInstanceCounter<=TrainRecordsRead; trainInstanceCounter++) { TrainDistance[trainCoreCounter,trainInstanceCounter] = 0; if (trainCoreCounter!=trainInstanceCounter) { for (fieldCounter=1; fieldCounter<=TotalFields; fieldCounter++) { if (TrainInput[trainInstanceCounter,fieldCounter]!=Ignore && TrainInput[trainCoreCounter,fieldCounter]!=Ignore) TrainDistance[trainCoreCounter,trainInstanceCounter]+=( (TrainInput[trainInstanceCounter,fieldCounter]-TrainInput[trainCoreCounter,fieldCounter]) / (TrainMax[fieldCounter]-TrainMin[fieldCounter]+10^(-20)) )^2; } } } } } #sort each core instance's distances and find the best neighborhood that minimized the median of mre's function findTrainMre(){ for (trainCoreCounter=1; trainCoreCounter<=TrainRecordsRead; trainCoreCounter++) { #prepare for sorting the array for each core instance for (trainInstanceCounter=1; 
trainInstanceCounter<=TrainRecordsRead; trainInstanceCounter++) { sortedTrainDistanceArray[trainInstanceCounter,1] = TrainDistance[trainCoreCounter,trainInstanceCounter]; sortedTrainDistanceArray[trainInstanceCounter,2] = TrainInput[trainInstanceCounter,NF]; sortedTrainDistanceArray[trainInstanceCounter,3] = trainInstanceCounter; } #sorting the array using insertion sort for (i = 1; i <= TrainRecordsRead; i++) { tempDistance = sortedTrainDistanceArray[i,1]; tempEffort = sortedTrainDistanceArray[i,2]; tempOriginalCounter = sortedTrainDistanceArray[i,3]; j = i; while ((j > 1) && (sortedTrainDistanceArray[j-1,1] > tempDistance)) { sortedTrainDistanceArray[j,1] = sortedTrainDistanceArray[j-1,1]; sortedTrainDistanceArray[j,2] = sortedTrainDistanceArray[j-1,2]; sortedTrainDistanceArray[j,3] = sortedTrainDistanceArray[j-1,3]; j = j - 1; } sortedTrainDistanceArray[j,1] = tempDistance; sortedTrainDistanceArray[j,2] = tempEffort; sortedTrainDistanceArray[j,3] = tempOriginalCounter; } #try every neighborhood bounded by the number of records in the training file for (neighborhoodCounter=2; neighborhoodCounter<=TrainRecordsRead; neighborhoodCounter++) { TestingCount = 0; TrainingCount = 0; Sum1 = 0; Sum2 = 0; Sum3 = 0; Sum4 = 0; #take the closest neighbor instances and run LC on them for (counter = 2; counter <= neighborhoodCounter; counter++) { #to be able to use the values of each field, the original counter should be used as the instance counter originalCounter = sortedTrainDistanceArray[counter,3]; #LC algorithm pass 1 on training data # Initialize the effort multiplier. EffortMultiplier = 1; # For the fields after the scalar factors and before the last two... for(I=1; I<=(NF-2); I++) EffortMultiplier *= TrainInput[originalCounter,I]; # Get the count for lines of code in the thousands. Kloc = TrainInput[originalCounter,NF-1]; # Get the actual recorded amount of effort in worker months. 
ActualEffort = TrainInput[originalCounter,NF]; #run LC on the nearest instances to find the estimate TrainingCount++; # the following test assures that the kloc is not repeating. In some cases, repeating klocs result in a division by zero in A's calculation. if (arrayKlocTrain[Kloc] == 1) Kloc += 10^(-20); arrayKlocTrain[Kloc] = 1; #calculate the sum values Sum1 += log(Kloc); Sum2 += (log(Kloc) * log(Kloc)); Sum3 += log (ActualEffort / EffortMultiplier); Sum4 += (log(Kloc) * log (ActualEffort / EffortMultiplier)); } #LC algorithm pass 2 on the test instance (taken from the train file) # Initialize the effort multiplier. EffortMultiplier = 1; # For the fields after the scalar factors and before the last two... for(I=1; I<=(NF-2); I++) EffortMultiplier *= TrainInput[trainCoreCounter,I]; # Get the count for lines of code in the thousands. Kloc = TrainInput[trainCoreCounter,NF-1]; # Get the actual recorded amount of effort in worker months. ActualEffort = TrainInput[trainCoreCounter,NF]; TestingCount++; if(TrainingCount>1) { # Immediately on entering Pass 2 calculate the A and B values from the information retained from Pass 1 if (TrainingCount*Sum2 - Sum1*Sum1 == 0) { A = ( Sum2*Sum3 - Sum1*Sum4 ) / ( TrainingCount*Sum2 - Sum1*Sum1 + 10^(-20)); B = ( TrainingCount*Sum4 - Sum1*Sum3 ) / ( TrainingCount*Sum2 - Sum1*Sum1 + 10^(-20)); } else { A = ( Sum2*Sum3 - Sum1*Sum4 ) / ( TrainingCount*Sum2 - Sum1*Sum1 ); B = ( TrainingCount*Sum4 - Sum1*Sum3 ) / ( TrainingCount*Sum2 - Sum1*Sum1 ); } } ScaleFactor = 0; # Calculate the estimated effort using Boehm's Cocomo formula and then calculate some statistics EstimatedEffort = ((B + ScaleFactor) * log(Kloc)) + A + log(EffortMultiplier); EstimatedEffort = E ^ EstimatedEffort; #it is important to note that neighborhood is actually neighborhoodCounter - 1 since neighborhoodCounter = 1 results in a neighborhood of 0 #which is not wanted here (hence the counters start from 2 in these loops). 
neighborhood = neighborhoodCounter - 1; estimate = EstimatedEffort; #actual value is the value from Input array in the last field actual = TrainInput[trainCoreCounter,NF]; #find the mre mre = (actual-estimate)/actual; mre = (mre < 0 ? -1*mre : mre); count[neighborhood]++; totalMre[neighborhood,count[neighborhood]] = mre; } } for (neighborhoodCounter=1; neighborhoodCounter<=TrainRecordsRead; neighborhoodCounter++) { #reseting the old values to 0; for (i = 1; i <= count[neighborhoodCounter-1]; i++) tempNeighborhoodArray[i] = 0; #preparing the array for sorting to find the median of mre's for (i = 1; i <= count[neighborhoodCounter]; i++) tempNeighborhoodArray[i] = totalMre[neighborhoodCounter,i]; #sorting the array asort(tempNeighborhoodArray); #finding the median mre if (count[neighborhoodCounter] % 2 == 0) medianCount = count[neighborhoodCounter] / 2; else medianCount = (count[neighborhoodCounter] / 2) + 0.5; medianMre[neighborhoodCounter] = tempNeighborhoodArray[medianCount]; } } #now that the training is done, it will do the testing with the instances in the second file FILENAME==ARGV[2]{ if (DoneOnce!=1) { calculateTrainDistance(); findTrainMre(); for (neighborhoodCounter=1; neighborhoodCounter<=TrainRecordsRead-1; neighborhoodCounter++) { if (medianMre[neighborhoodCounter] < MinMreValue) { MinMreValue = medianMre[neighborhoodCounter]; MinNeighborhood = neighborhoodCounter; } } DoneOnce = 1; #reset NR NR = 1; } #read each test instance and determine the overall min and max for each test instances (individually) and all train instances (already determined) for (fieldCounter=1; fieldCounter<=TotalFields; fieldCounter++) { TestInput[NR,fieldCounter]=$fieldCounter; if (TrainMax[fieldCounter] > $fieldCounter) TestMax[NR,fieldCounter] = TrainMax[fieldCounter]; else TestMax[NR,fieldCounter] = $fieldCounter; if (TrainMin[fieldCounter] < $fieldCounter) TestMin[NR,fieldCounter] = TrainMin[fieldCounter]; else TestMin[NR,fieldCounter] = $fieldCounter; } TestRecordsRead++; 
#the MinNeighborhood was determined and can be used } #using the instances in the train file, each test instance gets its distances calculated function calculateTestDistance(){ #find the distance of each test instance from all train instances for (testInstanceCounter=1; testInstanceCounter<=TestRecordsRead; testInstanceCounter++) { for (trainInstanceCounter=1; trainInstanceCounter<=TrainRecordsRead; trainInstanceCounter++) { TestDistance[testInstanceCounter,trainInstanceCounter] = 0; #it does not use the actual effort since it should not be known in testing for (fieldCounter=1; fieldCounter<=TotalFields-1; fieldCounter++) { if (TrainInput[trainInstanceCounter,fieldCounter]!=Ignore && TestInput[testInstanceCounter,fieldCounter]!=Ignore) TestDistance[testInstanceCounter,trainInstanceCounter]+=( (TrainInput[trainInstanceCounter,fieldCounter]-TestInput[testInstanceCounter,fieldCounter]) / (TestMax[testInstanceCounter,fieldCounter]-TestMin[testInstanceCounter,fieldCounter]+10^(-20)) )^2; } } } } #sort each core instance's distances and find the best neighborhood that minimized the sum of mre's function findTestEstimate(){ for (testInstanceCounter=1; testInstanceCounter<=TestRecordsRead; testInstanceCounter++) { #prepare for sorting the array for each core instance for (trainInstanceCounter=1; trainInstanceCounter<=TrainRecordsRead; trainInstanceCounter++) { sortedTestDistanceArray[trainInstanceCounter,1] = TestDistance[testInstanceCounter,trainInstanceCounter]; sortedTestDistanceArray[trainInstanceCounter,2] = TrainInput[trainInstanceCounter,NF]; sortedTestDistanceArray[trainInstanceCounter,3] = trainInstanceCounter; } #sorting the array using insertion sort for (i = 1; i <= TrainRecordsRead; i++) { tempDistance = sortedTestDistanceArray[i,1]; tempEffort = sortedTestDistanceArray[i,2]; tempOriginalCounter = sortedTestDistanceArray[i,3]; j = i; while ((j > 1) && (sortedTestDistanceArray[j-1,1] > tempDistance)) { sortedTestDistanceArray[j,1] = 
sortedTestDistanceArray[j-1,1]; sortedTestDistanceArray[j,2] = sortedTestDistanceArray[j-1,2]; sortedTestDistanceArray[j,3] = sortedTestDistanceArray[j-1,3]; j = j - 1; } sortedTestDistanceArray[j,1] = tempDistance; sortedTestDistanceArray[j,2] = tempEffort; sortedTestDistanceArray[j,3] = tempOriginalCounter; } #print the first MinNeighborhoods' estimate using LC from the sorted array TestingCount = 0; TrainingCount = 0; Sum1 = 0; Sum2 = 0; Sum3 = 0; Sum4 = 0; #take the closest neighbor instances and run LC on them for (counter = 2; counter <= MinNeighborhood; counter++) { #to be able to use the values of each field, the original counter should be used as the instance counter originalCounter = sortedTestDistanceArray[counter,3]; #LC algorithm pass 1 on training data # Initialize the effort multiplier. EffortMultiplier = 1; # For the fields after the scalar factors and before the last two... for(I=1; I<=(NF-2); I++) EffortMultiplier *= TrainInput[originalCounter,I]; # Get the count for lines of code in the thousands. Kloc = TrainInput[originalCounter,NF-1]; # Get the actual recorded amount of effort in worker months. ActualEffort = TrainInput[originalCounter,NF]; #run LC on the nearest instances to find the estimate TrainingCount++; # the following test assures that the kloc is not repeating. In some cases, repeating klocs result in a division by zero in A's calculation. if (arrayKlocTest[Kloc] == 1) Kloc += 10^(-20); arrayKlocTest[Kloc] = 1; #calculate the sum values Sum1 += log(Kloc); Sum2 += (log(Kloc) * log(Kloc)); Sum3 += log (ActualEffort / EffortMultiplier); Sum4 += (log(Kloc) * log (ActualEffort / EffortMultiplier)); } #LC algorithm pass 2 on the test instance (taken from the test file) # Initialize the effort multiplier. EffortMultiplier = 1; # For the fields after the scalar factors and before the last two... for(I=1; I<=(NF-2); I++) EffortMultiplier *= TestInput[testInstanceCounter,I]; # Get the count for lines of code in the thousands. 
Kloc = TestInput[testInstanceCounter,NF-1]; # Get the actual recorded amount of effort in worker months. ActualEffort = TestInput[testInstanceCounter,NF]; TestingCount++; if(TrainingCount>1) { # Immediately on entering Pass 2 calculate the A and B values from the information retained from Pass 1 if (TrainingCount*Sum2 - Sum1*Sum1 == 0) { A = ( Sum2*Sum3 - Sum1*Sum4 ) / ( TrainingCount*Sum2 - Sum1*Sum1 + 10^(-20)); B = ( TrainingCount*Sum4 - Sum1*Sum3 ) / ( TrainingCount*Sum2 - Sum1*Sum1 + 10^(-20)); } else { A = ( Sum2*Sum3 - Sum1*Sum4 ) / ( TrainingCount*Sum2 - Sum1*Sum1 ); B = ( TrainingCount*Sum4 - Sum1*Sum3 ) / ( TrainingCount*Sum2 - Sum1*Sum1 ); } } ScaleFactor = 0; # Calculate the estimated effort using Boehm's Cocomo formula and then calculate some statistics EstimatedEffort = ((B + ScaleFactor) * log(Kloc)) + A + log(EffortMultiplier); EstimatedEffort = E ^ EstimatedEffort; estimate = EstimatedEffort; #actual value is the value from Input array in the last field actual = TestInput[testInstanceCounter,NF]; print estimate, actual, MinNeighborhood; } } END{ calculateTestDistance(); findTestEstimate(); }