#!/usr/bin/gawk -f #This is a row pruning method. #It uses a train file to find the best neighborhood for the train instances (and uses LC to do the learning and find the estimates). #It then uses the best neighborhood, and for each test instance, it applies the best neighborhood to the train file and uses LC to find the estimate for each. BEGIN{ FS=OFS=","; Ignore="?"; TrainRecordsRead = 0; TestRecordsRead = 0; DoneOnce = 0; MinMreValue = 10^20; MinNeighborhood=0; E = 2.71828182846; } #using the train file, it will first find the MAX and MIN values in it FILENAME==ARGV[1]{ #initialize the max and min with the first record if (NR==1) { TotalFields=NF; for (fieldCounter=1; fieldCounter<=TotalFields; fieldCounter++) { TrainInput[NR,fieldCounter]=$fieldCounter; TrainMax[fieldCounter]=$fieldCounter; TrainMin[fieldCounter]=$fieldCounter; } } else { for (fieldCounter=1; fieldCounter<=TotalFields; fieldCounter++) { TrainInput[NR,fieldCounter]=$fieldCounter; if ($fieldCounter!=Ignore && TrainMax[fieldCounter]<$fieldCounter) TrainMax[fieldCounter]=$fieldCounter; if ($fieldCounter!=Ignore && TrainMin[fieldCounter]>$fieldCounter) TrainMin[fieldCounter]=$fieldCounter; } } TrainRecordsRead++; } #determining distances using the train file only function calculateTrainDistance(){ #find the distance of the core instance from all others (and not itself) for (trainCoreCounter=1; trainCoreCounter<=TrainRecordsRead; trainCoreCounter++) { for (trainInstanceCounter=1; trainInstanceCounter<=TrainRecordsRead; trainInstanceCounter++) { TrainDistance[trainCoreCounter,trainInstanceCounter] = 0; if (trainCoreCounter!=trainInstanceCounter) { for (fieldCounter=1; fieldCounter<=TotalFields; fieldCounter++) { if (TrainInput[trainInstanceCounter,fieldCounter]!=Ignore && TrainInput[trainCoreCounter,fieldCounter]!=Ignore) TrainDistance[trainCoreCounter,trainInstanceCounter]+=( (TrainInput[trainInstanceCounter,fieldCounter]-TrainInput[trainCoreCounter,fieldCounter]) / (TrainMax[fieldCounter]-TrainMin[fieldCounter]+10^(-20)) )^2; } } } } } #sort each core instance's distances and find the best neighborhood that minimized the mean of mre's function findTrainMre(){ for (trainCoreCounter=1; trainCoreCounter<=TrainRecordsRead; trainCoreCounter++) { #prepare for sorting the array for each core instance for (trainInstanceCounter=1; trainInstanceCounter<=TrainRecordsRead; trainInstanceCounter++) { sortedTrainDistanceArray[trainInstanceCounter,1] = TrainDistance[trainCoreCounter,trainInstanceCounter]; sortedTrainDistanceArray[trainInstanceCounter,2] = TrainInput[trainInstanceCounter,NF]; sortedTrainDistanceArray[trainInstanceCounter,3] = trainInstanceCounter; } #sorting the array using insertion sort for (i = 1; i <= TrainRecordsRead; i++) { tempDistance = sortedTrainDistanceArray[i,1]; tempEffort = sortedTrainDistanceArray[i,2]; tempOriginalCounter = sortedTrainDistanceArray[i,3]; j = i; while ((j > 1) && (sortedTrainDistanceArray[j-1,1] > tempDistance)) { sortedTrainDistanceArray[j,1] = sortedTrainDistanceArray[j-1,1]; sortedTrainDistanceArray[j,2] = sortedTrainDistanceArray[j-1,2]; sortedTrainDistanceArray[j,3] = sortedTrainDistanceArray[j-1,3]; j = j - 1; } sortedTrainDistanceArray[j,1] = tempDistance; sortedTrainDistanceArray[j,2] = tempEffort; sortedTrainDistanceArray[j,3] = tempOriginalCounter; } #try every neighborhood bounded by the number of records in the training file for (neighborhoodCounter=2; neighborhoodCounter<=TrainRecordsRead; neighborhoodCounter++) { TestingCount = 0; TrainingCount = 0; Sum1 = 0; Sum2 = 0; Sum3 = 0; Sum4 = 0; #take the closest neighbor instances and run LC on them for (counter = 2; counter <= neighborhoodCounter; counter++) { #to be able to use the values of each field, the original counter should be used as the instance counter originalCounter = sortedTrainDistanceArray[counter,3]; #LC algorithm pass 1 on training data # Initialize the effort multiplier. EffortMultiplier = 1; # For the fields after the scalar factors and before the last two... for(I=1; I<=(NF-2); I++) EffortMultiplier *= TrainInput[originalCounter,I]; # Get the count for lines of code in the thousands. Kloc = TrainInput[originalCounter,NF-1]; # Get the actual recorded amount of effort in worker months. ActualEffort = TrainInput[originalCounter,NF]; #run LC on the nearest instances to find the estimate TrainingCount++; # the following test assures that the kloc is not repeating. In some cases, repeating klocs result in a division by zero in A's calculation. if (arrayKlocTrain[Kloc] == 1) Kloc += 10^(-20); arrayKlocTrain[Kloc] = 1; #calculate the sum values Sum1 += log(Kloc); Sum2 += (log(Kloc) * log(Kloc)); Sum3 += log (ActualEffort / EffortMultiplier); Sum4 += (log(Kloc) * log (ActualEffort / EffortMultiplier)); } #LC algorithm pass 2 on the test instance (taken from the train file) # Initialize the effort multiplier. EffortMultiplier = 1; # For the fields after the scalar factors and before the last two... for(I=1; I<=(NF-2); I++) EffortMultiplier *= TrainInput[trainCoreCounter,I]; # Get the count for lines of code in the thousands. Kloc = TrainInput[trainCoreCounter,NF-1]; # Get the actual recorded amount of effort in worker months. ActualEffort = TrainInput[trainCoreCounter,NF]; TestingCount++; if(TrainingCount>1) { # Immediately on entering Pass 2 calculate the A and B values from the information retained from Pass 1 if (TrainingCount*Sum2 - Sum1*Sum1 == 0) { A = ( Sum2*Sum3 - Sum1*Sum4 ) / ( TrainingCount*Sum2 - Sum1*Sum1 + 10^(-20)); B = ( TrainingCount*Sum4 - Sum1*Sum3 ) / ( TrainingCount*Sum2 - Sum1*Sum1 + 10^(-20)); } else { A = ( Sum2*Sum3 - Sum1*Sum4 ) / ( TrainingCount*Sum2 - Sum1*Sum1 ); B = ( TrainingCount*Sum4 - Sum1*Sum3 ) / ( TrainingCount*Sum2 - Sum1*Sum1 ); } } ScaleFactor = 0; # Calculate the estimated effort using Boehm's Cocomo formula and then calculate some statistics EstimatedEffort = ((B + ScaleFactor) * log(Kloc)) + A + log(EffortMultiplier); EstimatedEffort = E ^ EstimatedEffort; #it is important to note that neighborhood is actually neighborhoodCounter - 1 since neighborhoodCounter = 1 results in a neighborhood of 0 #which is not wanted here (hence the counters start from 2 in these loops). neighborhood = neighborhoodCounter - 1; estimate = EstimatedEffort; #actual value is the value from Input array in the last field actual = TrainInput[trainCoreCounter,NF]; #find the mre mre = (actual-estimate)/actual; mre = (mre < 0 ? -1*mre : mre); totalMre[neighborhood] += mre; } } } #now that the training is done, it will do the testing with the instances in the second file FILENAME==ARGV[2]{ if (DoneOnce!=1) { calculateTrainDistance(); findTrainMre(); for (neighborhoodCounter=1; neighborhoodCounter<=TrainRecordsRead-1; neighborhoodCounter++) { totalMre[neighborhoodCounter] = totalMre[neighborhoodCounter]/TrainRecordsRead; if (totalMre[neighborhoodCounter] < MinMreValue) { MinMreValue = totalMre[neighborhoodCounter]; MinNeighborhood = neighborhoodCounter; } } DoneOnce = 1; #reset NR NR = 1; } #read each test instance and determine the overall min and max for each test instances (individually) and all train instances (already determined) for (fieldCounter=1; fieldCounter<=TotalFields; fieldCounter++) { TestInput[NR,fieldCounter]=$fieldCounter; if (TrainMax[fieldCounter] > $fieldCounter) TestMax[NR,fieldCounter] = TrainMax[fieldCounter]; else TestMax[NR,fieldCounter] = $fieldCounter; if (TrainMin[fieldCounter] < $fieldCounter) TestMin[NR,fieldCounter] = TrainMin[fieldCounter]; else TestMin[NR,fieldCounter] = $fieldCounter; } TestRecordsRead++; #the MinNeighborhood was determined and can be used } #using the instances in the train file, each test instance gets its distances calculated function calculateTestDistance(){ #find the distance of each test instance from all train instances for (testInstanceCounter=1; testInstanceCounter<=TestRecordsRead; testInstanceCounter++) { for (trainInstanceCounter=1; trainInstanceCounter<=TrainRecordsRead; trainInstanceCounter++) { TestDistance[testInstanceCounter,trainInstanceCounter] = 0; #it does not use the actual effort since it should not be known in testing for (fieldCounter=1; fieldCounter<=TotalFields-1; fieldCounter++) { if (TrainInput[trainInstanceCounter,fieldCounter]!=Ignore && TestInput[testInstanceCounter,fieldCounter]!=Ignore) TestDistance[testInstanceCounter,trainInstanceCounter]+=( (TrainInput[trainInstanceCounter,fieldCounter]-TestInput[testInstanceCounter,fieldCounter]) / (TestMax[testInstanceCounter,fieldCounter]-TestMin[testInstanceCounter,fieldCounter]+10^(-20)) )^2; } } } } #sort each core instance's distances and find the best neighborhood that minimized the sum of mre's function findTestEstimate(){ for (testInstanceCounter=1; testInstanceCounter<=TestRecordsRead; testInstanceCounter++) { #prepare for sorting the array for each core instance for (trainInstanceCounter=1; trainInstanceCounter<=TrainRecordsRead; trainInstanceCounter++) { sortedTestDistanceArray[trainInstanceCounter,1] = TestDistance[testInstanceCounter,trainInstanceCounter]; sortedTestDistanceArray[trainInstanceCounter,2] = TrainInput[trainInstanceCounter,NF]; sortedTestDistanceArray[trainInstanceCounter,3] = trainInstanceCounter; } #sorting the array using insertion sort for (i = 1; i <= TrainRecordsRead; i++) { tempDistance = sortedTestDistanceArray[i,1]; tempEffort = sortedTestDistanceArray[i,2]; tempOriginalCounter = sortedTestDistanceArray[i,3]; j = i; while ((j > 1) && (sortedTestDistanceArray[j-1,1] > tempDistance)) { sortedTestDistanceArray[j,1] = sortedTestDistanceArray[j-1,1]; sortedTestDistanceArray[j,2] = sortedTestDistanceArray[j-1,2]; sortedTestDistanceArray[j,3] = sortedTestDistanceArray[j-1,3]; j = j - 1; } sortedTestDistanceArray[j,1] = tempDistance; sortedTestDistanceArray[j,2] = tempEffort; sortedTestDistanceArray[j,3] = tempOriginalCounter; } #print the first MinNeighborhoods' estimate using LC from the sorted array TestingCount = 0; TrainingCount = 0; Sum1 = 0; Sum2 = 0; Sum3 = 0; Sum4 = 0; #take the closest neighbor instances and run LC on them for (counter = 2; counter <= MinNeighborhood; counter++) { #to be able to use the values of each field, the original counter should be used as the instance counter originalCounter = sortedTestDistanceArray[counter,3]; #LC algorithm pass 1 on training data # Initialize the effort multiplier. EffortMultiplier = 1; # For the fields after the scalar factors and before the last two... for(I=1; I<=(NF-2); I++) EffortMultiplier *= TrainInput[originalCounter,I]; # Get the count for lines of code in the thousands. Kloc = TrainInput[originalCounter,NF-1]; # Get the actual recorded amount of effort in worker months. ActualEffort = TrainInput[originalCounter,NF]; #run LC on the nearest instances to find the estimate TrainingCount++; # the following test assures that the kloc is not repeating. In some cases, repeating klocs result in a division by zero in A's calculation. if (arrayKlocTest[Kloc] == 1) Kloc += 10^(-20); arrayKlocTest[Kloc] = 1; #calculate the sum values Sum1 += log(Kloc); Sum2 += (log(Kloc) * log(Kloc)); Sum3 += log (ActualEffort / EffortMultiplier); Sum4 += (log(Kloc) * log (ActualEffort / EffortMultiplier)); } #LC algorithm pass 2 on the test instance (taken from the test file) # Initialize the effort multiplier. EffortMultiplier = 1; # For the fields after the scalar factors and before the last two... for(I=1; I<=(NF-2); I++) EffortMultiplier *= TestInput[testInstanceCounter,I]; # Get the count for lines of code in the thousands. Kloc = TestInput[testInstanceCounter,NF-1]; # Get the actual recorded amount of effort in worker months. ActualEffort = TestInput[testInstanceCounter,NF]; TestingCount++; if(TrainingCount>1) { # Immediately on entering Pass 2 calculate the A and B values from the information retained from Pass 1 if (TrainingCount*Sum2 - Sum1*Sum1 == 0) { A = ( Sum2*Sum3 - Sum1*Sum4 ) / ( TrainingCount*Sum2 - Sum1*Sum1 + 10^(-20)); B = ( TrainingCount*Sum4 - Sum1*Sum3 ) / ( TrainingCount*Sum2 - Sum1*Sum1 + 10^(-20)); } else { A = ( Sum2*Sum3 - Sum1*Sum4 ) / ( TrainingCount*Sum2 - Sum1*Sum1 ); B = ( TrainingCount*Sum4 - Sum1*Sum3 ) / ( TrainingCount*Sum2 - Sum1*Sum1 ); } } ScaleFactor = 0; # Calculate the estimated effort using Boehm's Cocomo formula and then calculate some statistics EstimatedEffort = ((B + ScaleFactor) * log(Kloc)) + A + log(EffortMultiplier); EstimatedEffort = E ^ EstimatedEffort; estimate = EstimatedEffort; #actual value is the value from Input array in the last field actual = TestInput[testInstanceCounter,NF]; print estimate, actual, MinNeighborhood; } } END{ calculateTestDistance(); findTestEstimate(); }