#!/usr/bin/gawk -f #Note: this assumes that the last attribute is the class attribute BEGIN { FS = OFS = ","; NotKnown = "?"; NormalK = 10; ChunkK = 4; TooMuch = 2; TopBins = 2; BestBinCountRatio = 0.2; UsefulFieldCount = 6; Seed = 1; } function findMinMax() { for (field = 1; field <= totalFields; field++) { if (numericFields[field] == 1) { Min[field] = Data[1,field]; Max[field] = Data[1,field]; } } for (instance = 1; instance <= totalInstances; instance++) { for (field = 1; field <= totalFields; field++) { if (numericFields[field] == 1 && Data[instance,field] != NotKnown) { if (Data[instance,field] < Min[field]) Min[field] = Data[instance,field]; if (Data[instance,field] > Max[field]) Max[field] = Data[instance,field]; } } } } function initBins() { for (field = 1; field <= totalFields; field++) { if (numericFields[field] == 1) { TotalBins[field] = NormalK; for (binCounter = 1; binCounter <= TotalBins[field]; binCounter++) { Breaks[field,binCounter] = Min[field] + (binCounter - 1) * (Max[field] - Min[field])/TotalBins[field]; Counts[field,binCounter] = 0; } } } } #IMPORTANT: less than min and greater than max bin ranges should be handled such that they change max and min #NOTE: don't just create a new bin on the left and right bu adjusting Min[field] and Max[field]. The breaks are determined using Min[field] and Max[field] function findBin(field,value,inputBreaks,inputTotalBins) { #this returns a bin number that contains the value passed to the function if (value <= inputBreaks[field,1]) return 1; else if (value >= inputBreaks[field,inputTotalBins[field]]) return inputTotalBins[field]; else { #find the correct bin using bin chop algorithm leftBin = 1; rightBin = inputTotalBins[field]; while (leftBin < rightBin) { middleBin = int((leftBin + rightBin) / 2); if (value > inputBreaks[field,middleBin]) leftBin = middleBin + 1; else if (value < inputBreaks[field,middleBin]) rightBin = middleBin; else return middleBin; } return leftBin - 1; } } function fillBins() { for (instance = 1; instance <= totalInstances; instance++) { for (field = 1; field <= totalFields; field++) { if (numericFields[field] == 1 && Data[instance,field] != NotKnown) { #increase the right bin's count bin = findBin(field,Data[instance,field],Breaks,TotalBins); Counts[field,bin]++; #if the count it too high (measure according the equation below), split the bin and make adjustments #IMPORTANT: is this measure using instances in the bottom of both sides? If so, they cancel out! anny other measure? if (Counts[field,bin] > TooMuch*TotalBins[field]) { #make a break point in the middle of the current bin (upper bound is the same as lower bound of the next bin if exists or Max) if (bin == TotalBins[field]) newBreakPoint = (Breaks[field,bin] + Max[field])/2; else newBreakPoint = (Breaks[field,bin] + Breaks[field,bin+1]) / 2; #shift the bins one up for (binCounter = TotalBins[field]; binCounter > bin; binCounter--) { Breaks[field,binCounter+1] = Breaks[field,binCounter]; Counts[field,binCounter+1] = Counts[field,binCounter]; } #make adjustments (simply divide the number of counts between the new bin and the old bin) TotalBins[field]++; Breaks[field,bin+1] = newBreakPoint; Counts[field,bin+1] = Counts[field,bin] / 2; Counts[field,bin] = Counts[field,bin] / 2; } } } } } function chunkBins() { #this puts the normal bins into chunked bins so that they can be categorized into a smaller number of bins for (field = 1; field <= totalFields; field++) { if (numericFields[field] == 1) { newBinCounter = 1; newBreaks[field,1] = Breaks[field,1]; newCounts[field,1] = Counts[field,1]; for (binCounter = 2; binCounter <= TotalBins[field]; binCounter++) { #the limit for bin count is specified here. if it is greater than the limit, it will be placed in the next bin if (newCounts[field,newBinCounter] + Counts[field,binCounter] <= totalInstances / ChunkK) newCounts[field,newBinCounter] += Counts[field,binCounter]; else { newBinCounter++; newBreaks[field,newBinCounter] = Breaks[field,binCounter]; newCounts[field,newBinCounter] = Counts[field,binCounter]; } } newTotalBins[field] = newBinCounter; } } } function findBestBins() { #take the class field and take its top N bins's Breaks' lower bound as the lower bound for being in the best class value classField = totalFields; bestBreak = newBreaks[classField,newTotalBins[classField]-TopBins+1]; for (instance = 1; instance <= totalInstances; instance++) { #if the class value is above the best break, each field's bin in the whole instance should be in the best bin list if (Data[instance,classField] >= bestBreak) { best++; for (field = 1; field <= totalFields; field++) { if (numericFields[field] == 1 && Data[instance,field] != NotKnown) { bin = findBin(field,Data[instance,field],newBreaks,newTotalBins); bestBins[field,bin]++; #also keep track of the max number of a (field,bin) pair's count if (bestBins[field,bin] > MaxBestBinCount) MaxBestBinCount = bestBins[field,bin]; } } } } } function curveBestBins() { srand(Seed); randomRatio = rand(); tempCounter = 0; #sum up the counts in all best bins for (field = 1; field <= totalFields-1; field++) { if (numericFields[field] == 1) { for (binCounter = 1; binCounter <= newTotalBins[field]; binCounter++) { tempCounter++; tempArray[tempCounter,1] = bestBins[field,binCounter]; tempArray[tempCounter,2] = field; tempArray[tempCounter,3] = binCounter; } } } tempArraySize = tempCounter; #this is a very slow sort. Needs to be optimized later for (i = 1; i <= tempArraySize; i++) { tempCount = tempArray[i,1]; tempField = tempArray[i,2]; tempBinCounter = tempArray[i,3]; j = i; while ((j > 1) && (tempArray[j-1,1] > tempCount)) { tempArray[j,1] = tempArray[j-1,1]; tempArray[j,2] = tempArray[j-1,2]; tempArray[j,3] = tempArray[j-1,3]; j = j - 1; } tempArray[j,1] = tempCount; tempArray[j,2] = tempField; tempArray[j,3] = tempBinCounter; } #generate the cumulative sum for each one for (tempCounter = 1; tempCounter <= tempArraySize; tempCounter++) { sumArray[tempCounter,1] = sumArray[tempCounter-1,1] + tempArray[tempCounter,1]; sumArray[tempCounter,2] = tempArray[tempCounter,2]; sumArray[tempCounter,3] = tempArray[tempCounter,3]; } #this is used for reporting the results of best ranges of each field for (field = 1; field <= totalFields-1; field++) selectedValue[field] = "?"; #in this part, it selects the best range of each field (if above the threshold) or leave it as "?" to fill it in with the model later MinSumValue = sumArray[1,1]; MaxSumValue = sumArray[tempArraySize,1]; for (tempCounter = tempArraySize; tempCounter >= 1; tempCounter--) { # print sumArray[tempCounter,1]; if (sumArray[tempCounter,1] >= MinSumValue + (MaxSumValue - MinSumValue) * randomRatio && seenField[sumArray[tempCounter,2]]==0) { #only choose the top bin for each field and not the rest, even though it is above the threshold seenField[sumArray[tempCounter,2]] = 1; # print "field =" sumArray[tempCounter,2] " and bin = " sumArray[tempCounter,3]; field = sumArray[tempCounter,2]; bin = sumArray[tempCounter,3]; lowerBound = newBreaks[field,bin]; if (bin == newTotalBins[field]) upperBound = Max[field]; else upperBound = newBreaks[field,bin+1]; srand(++Seed); randomRange = rand(); selectedValue[field] = lowerBound + (upperBound - lowerBound) * randomRange; # print "a random number in this range of ["lowerBound","upperBound"] is: "selectedValue[field]; } } for (field = 1; field <= totalFields-2; field++) printf ("%s,",selectedValue[field]); print selectedValue[field]; } END { MaxBestBinCount = 0; findMinMax(); # for (field = 1; field <= totalFields; field++) # if (numericFields[field] == 1) # print "Field #" field " has min="Min[field] " and max="Max[field]; initBins(); fillBins(); # for (field = 1; field <= totalFields; field++) # { # if (numericFields[field] == 1) # for (binCounter = 1; binCounter <= TotalBins[field]; binCounter++) # print "Bin #"binCounter " starts at "Breaks[field,binCounter] " and has " Counts[field,binCounter] " values"; # } chunkBins(); # for (field = 1; field <= totalFields; field++) # { # if (numericFields[field] == 1) # for (binCounter = 1; binCounter <= newTotalBins[field]; binCounter++) # print "for field #" field " chunked Bin #"binCounter " starts at "newBreaks[field,binCounter] " and has " newCounts[field,binCounter] " values"; # } findBestBins(); # for (field = 1; field <= totalFields; field++) # { # if (numericFields[field] == 1) # for (binCounter = 1; binCounter <= newTotalBins[field]; binCounter++) # print "for field #" field " bin #" binCounter " has been best " bestBins[field,binCounter]+0 " times"; # } curveBestBins(); }