#!/usr/bin/gawk -f #this is based on training using MRE BEGIN{ FS=OFS=","; Ignore="?"; TrainRecordsRead = 0; TestRecordsRead = 0; DoneOnce = 0; MinMreValue = 10^20; MinNeighborhood=0; } #using the train file, it will first find the MAX and MIN values in it FILENAME==ARGV[1]{ #initialize the max and min with the first record if (NR==1) { TotalFields=NF; for (fieldCounter=1; fieldCounter<=TotalFields; fieldCounter++) { TrainInput[NR,fieldCounter]=$fieldCounter; TrainMax[fieldCounter]=$fieldCounter; TrainMin[fieldCounter]=$fieldCounter; } } else { for (fieldCounter=1; fieldCounter<=TotalFields; fieldCounter++) { TrainInput[NR,fieldCounter]=$fieldCounter; if ($fieldCounter!=Ignore && TrainMax[fieldCounter]<$fieldCounter) TrainMax[fieldCounter]=$fieldCounter; if ($fieldCounter!=Ignore && TrainMin[fieldCounter]>$fieldCounter) TrainMin[fieldCounter]=$fieldCounter; } } TrainRecordsRead++; } #determining distances using the train file only function calculateTrainDistance(){ #find the distance of the core instance from all others (and not itself) for (trainCoreCounter=1; trainCoreCounter<=TrainRecordsRead; trainCoreCounter++) { for (trainInstanceCounter=1; trainInstanceCounter<=TrainRecordsRead; trainInstanceCounter++) { TrainDistance[trainCoreCounter,trainInstanceCounter] = 0; if (trainCoreCounter!=trainInstanceCounter) { for (fieldCounter=1; fieldCounter<=TotalFields; fieldCounter++) { if (TrainInput[trainInstanceCounter,fieldCounter]!=Ignore && TrainInput[trainCoreCounter,fieldCounter]!=Ignore) TrainDistance[trainCoreCounter,trainInstanceCounter]+=( (TrainInput[trainInstanceCounter,fieldCounter]-TrainInput[trainCoreCounter,fieldCounter]) / (TrainMax[fieldCounter]-TrainMin[fieldCounter]+10^(-20)) )^2; } } } } } #using the instances in the train file, each test instance gets its distances calculated function calculateTestDistance(){ #find the distance of each test instance from all train instances for (testInstanceCounter=1; testInstanceCounter<=TestRecordsRead; testInstanceCounter++) { for (trainInstanceCounter=1; trainInstanceCounter<=TrainRecordsRead; trainInstanceCounter++) { TestDistance[testInstanceCounter,trainInstanceCounter] = 0; #it does not use the actual effort since it should not be known in testing for (fieldCounter=1; fieldCounter<=TotalFields-1; fieldCounter++) { if (TrainInput[trainInstanceCounter,fieldCounter]!=Ignore && TestInput[testInstanceCounter,fieldCounter]!=Ignore) TestDistance[testInstanceCounter,trainInstanceCounter]+=( (TrainInput[trainInstanceCounter,fieldCounter]-TestInput[testInstanceCounter,fieldCounter]) / (TestMax[testInstanceCounter,fieldCounter]-TestMin[testInstanceCounter,fieldCounter]+10^(-20)) )^2; } } } } #sort each core instance's distances and find the best neighborhood that minimized the sum of mre's function findTrainMre(){ for (trainCoreCounter=1; trainCoreCounter<=TrainRecordsRead; trainCoreCounter++) { #prepare for sorting the array for each core instance for (trainInstanceCounter=1; trainInstanceCounter<=TrainRecordsRead; trainInstanceCounter++) { sortedTrainDistanceArray[trainInstanceCounter,1] = TrainDistance[trainCoreCounter,trainInstanceCounter]; sortedTrainDistanceArray[trainInstanceCounter,2] = TrainInput[trainInstanceCounter,NF]; } #sorting the array using insertion sort for (i = 1; i <= TrainRecordsRead; i++) { tempDistance = sortedTrainDistanceArray[i,1]; tempEffort = sortedTrainDistanceArray[i,2]; j = i; #the second part of the while loop condition assures that we sort on tempEffort as well when two or more records have equal distances #this allows for selection of smaller effort values when distances are equal in the neighborhood selection while ((j > 1) && (sortedTrainDistanceArray[j-1,1] > tempDistance || (sortedTrainDistanceArray[j-1,1]==tempDistance && sortedTrainDistanceArray[j-1,2]>tempEffort))) # while ((j > 1) && (sortedTrainDistanceArray[j-1,1] > tempDistance)) { sortedTrainDistanceArray[j,1] = sortedTrainDistanceArray[j-1,1]; sortedTrainDistanceArray[j,2] = sortedTrainDistanceArray[j-1,2]; j = j - 1; } sortedTrainDistanceArray[j,1] = tempDistance; sortedTrainDistanceArray[j,2] = tempEffort; } #try every neighborhood bounded by the number of records in the training file for (neighborhoodCounter=2; neighborhoodCounter<=TrainRecordsRead; neighborhoodCounter++) { sum = 0; #take the average of the effort of the closest number of neighbors for (counter = 2; counter <= neighborhoodCounter; counter++) { sum+=sortedTrainDistanceArray[counter,2]; } #it is important to note that neighborhood is actually neighborhoodCounter - 1 since neighborhoodCounter = 1 results in a neighborhood of 0 #which is not wanted here (hence the counters start from 2 in these loops). neighborhood = neighborhoodCounter - 1; #now average is the new estimate (and the core itself was counted in the neighborhood but it is not needed in finding the sum) estimate = sum/(neighborhood); #actual value is the value from Input array in the last field actual = TrainInput[trainCoreCounter,NF]; #find the mre mre = (actual-estimate)/actual; mre = (mre < 0 ? -1*mre : mre); totalMre[neighborhood]+=mre; } } } #for each test instance, it uses the MinNeighborhood to find the estimate function findTestEstimate(){ for (testInstanceCounter=1; testInstanceCounter<=TestRecordsRead; testInstanceCounter++) { #prepare for sorting the array for each core instance for (trainInstanceCounter=1; trainInstanceCounter<=TrainRecordsRead; trainInstanceCounter++) { sortedTestDistanceArray[trainInstanceCounter,1] = TestDistance[testInstanceCounter,trainInstanceCounter]; sortedTestDistanceArray[trainInstanceCounter,2] = TrainInput[trainInstanceCounter,NF]; } #sorting the array using insertion sort for (i = 1; i <= TrainRecordsRead; i++) { tempDistance = sortedTestDistanceArray[i,1]; tempEffort = sortedTestDistanceArray[i,2]; j = i; #the second part of the while loop condition assures that we sort on tempEffort as well when two or more records have equal distances #this allows for selection of smaller effort values when distances are equal in the neighborhood selection while ((j > 1) && (sortedTestDistanceArray[j-1,1] > tempDistance || (sortedTestDistanceArray[j-1,1]==tempDistance && sortedTestDistanceArray[j-1,2]>tempEffort))) # while ((j > 1) && (sortedTestDistanceArray[j-1,1] > tempDistance)) { sortedTestDistanceArray[j,1] = sortedTestDistanceArray[j-1,1]; sortedTestDistanceArray[j,2] = sortedTestDistanceArray[j-1,2]; j = j - 1; } sortedTestDistanceArray[j,1] = tempDistance; sortedTestDistanceArray[j,2] = tempEffort; } #print the first MinNeighborhoods' average from the sorted array sum = 0; for (counter = 1; counter <= MinNeighborhood; counter++) { sum+=sortedTestDistanceArray[counter,2]; } estimate = sum/(MinNeighborhood); actual = TestInput[testInstanceCounter,NF]; print estimate, actual, MinNeighborhood; } } #now that the training is done, it will do the testing with the instances in the second file FILENAME==ARGV[2]{ if (DoneOnce!=1) { calculateTrainDistance(); findTrainMre(); for (neighborhoodCounter=1; neighborhoodCounter<=TrainRecordsRead-1; neighborhoodCounter++) { totalMre[neighborhoodCounter] = totalMre[neighborhoodCounter]/TrainRecordsRead; if (totalMre[neighborhoodCounter] < MinMreValue) { MinMreValue = totalMre[neighborhoodCounter]; MinNeighborhood = neighborhoodCounter; } } DoneOnce = 1; #reset NR NR = 1; } #read each test instance and determine the overall min and max for each test instances (individually) and all train instances (already determined) for (fieldCounter=1; fieldCounter<=TotalFields; fieldCounter++) { TestInput[NR,fieldCounter]=$fieldCounter; if (TrainMax[fieldCounter] > $fieldCounter) TestMax[NR,fieldCounter] = TrainMax[fieldCounter]; else TestMax[NR,fieldCounter] = $fieldCounter; if (TrainMin[fieldCounter] < $fieldCounter) TestMin[NR,fieldCounter] = TrainMin[fieldCounter]; else TestMin[NR,fieldCounter] = $fieldCounter; } TestRecordsRead++; #the MinNeighborhood was determined and can be used } END{ calculateTestDistance(); findTestEstimate(); }