#!/usr/bin/gawk -f BEGIN { FS = OFS = ","; NotKnown = "?"; NormalK = 10; ChunkK = 4; TooMuch = 2; TopBins = 2; Seed = 0; TotalFields = 6; OFMT="%10.10g" } END { init(); for (era = 0; era <= 10; era++) { print "this is era #"era; findMinMax(); initBins(); fillBins(); chunkBins(); findBestBins(); # clean the first (normal) layer of data which is only the Data array and get ready to put the new instances in delete Data; for (run = 1; run <= 1000; run++) { curveBestBins(); a = selectedValue[1]; b = selectedValue[2]; c = selectedValue[3]; d = selectedValue[4]; e = selectedValue[5]; srand(++Seed); #if these values are equal to "?" they were not determined from the curve. otherwise, they are cached and can be used if (a == "?") a = x_a(); if (b == "?") b = x_b(); if (c == "?") c = x_c(); if (d == "?") d = x_d(); if (e == "?") e = x_e(); score = (a^2/sin(sqrt(b)))*c*sin(d)*e; InputArray[0] = 6; InputArray[1] = a; InputArray[2] = b; InputArray[3] = c; InputArray[4] = d; InputArray[5] = e; InputArray[6] = score; addInstance(InputArray); print a,b,c,d,e,score; } # clean the second (chunk) layer of data which is all other arrays but the Data array and get ready for next chunking delete Max; delete Min; delete TotalBins; delete Breaks; delete Counts; delete newTotalBins; delete newBreaks; delete newCounts; delete bestBins; delete tempArray; delete sumArray; delete seenField; delete selectedValue; } print "best:"a,b,c,d,e,score; } function x_a() {return real(1,100)} function x_b() {return radian() } function x_c() {return real(1,100)} function x_d() {return radian() } function x_e() {return real(1,100)} function radian() { return rand()*2*1068966896/340262731; } function between(min,max) { if (max == min) return min; else return min+(max-min)*rand(); } function real(min,max) { return between(min,max); } function round(x) { return int(x+0.5); } function init() { for (counter = 1; counter <= 1000; counter++) { srand(++Seed); a = x_a(); b = x_b(); c = x_c(); d = x_d(); e = x_e(); score = (a^2/sin(sqrt(b)))*c*sin(d)*e; InputArray[1] = a; InputArray[2] = b; InputArray[3] = c; InputArray[4] = d; InputArray[5] = e; InputArray[6] = score; addInstance(InputArray); } } function addInstance(InputArray) { totalInstances++; for (field = 1; field <= TotalFields; field++) { Data[totalInstances,field] = InputArray[field]; #NOTE: this needs more attention numericFields[field] = 1; } } function findMinMax() { for (field = 1; field <= TotalFields; field++) { if (numericFields[field] == 1) { Min[field] = Data[1,field]; Max[field] = Data[1,field]; } } for (instance = 1; instance <= totalInstances; instance++) { for (field = 1; field <= TotalFields; field++) { if (numericFields[field] == 1 && Data[instance,field] != NotKnown) { if (Data[instance,field] < Min[field]) Min[field] = Data[instance,field]; if (Data[instance,field] > Max[field]) Max[field] = Data[instance,field]; } } } } function initBins() { for (field = 1; field <= TotalFields; field++) { if (numericFields[field] == 1) { TotalBins[field] = NormalK; for (binCounter = 1; binCounter <= TotalBins[field]; binCounter++) { Breaks[field,binCounter] = Min[field] + (binCounter - 1) * (Max[field] - Min[field])/TotalBins[field]; Counts[field,binCounter] = 0; } } } } #IMPORTANT: less than min and greater than max bin ranges should be handled such that they change max and min #NOTE: don't just create a new bin on the left and right by adjusting Min[field] and Max[field]. The breaks are determined using Min[field] and Max[field] function findBin(field,value,inputBreaks,inputTotalBins) { #this returns a bin number that contains the value passed to the function if (value <= inputBreaks[field,1]) return 1; else if (value >= inputBreaks[field,inputTotalBins[field]]) return inputTotalBins[field]; else { #find the correct bin using bin chop algorithm leftBin = 1; rightBin = inputTotalBins[field]; while (leftBin < rightBin) { middleBin = int((leftBin + rightBin) / 2); if (value > inputBreaks[field,middleBin]) leftBin = middleBin + 1; else if (value < inputBreaks[field,middleBin]) rightBin = middleBin; else return middleBin; } return leftBin - 1; } } function fillBins() { for (instance = 1; instance <= totalInstances; instance++) { for (field = 1; field <= TotalFields; field++) { if (numericFields[field] == 1 && Data[instance,field] != NotKnown) { #increase the right bin's count bin = findBin(field,Data[instance,field],Breaks,TotalBins); Counts[field,bin]++; #if the count it too high (measure according the equation below), split the bin and make adjustments #IMPORTANT: is this measure using instances in the bottom of both sides? If so, they cancel out! anny other measure? if (Counts[field,bin] > TooMuch*TotalBins[field]) { #make a break point in the middle of the current bin (upper bound is the same as lower bound of the next bin if exists or Max) if (bin == TotalBins[field]) newBreakPoint = (Breaks[field,bin] + Max[field])/2; else newBreakPoint = (Breaks[field,bin] + Breaks[field,bin+1]) / 2; #shift the bins one up for (binCounter = TotalBins[field]; binCounter > bin; binCounter--) { Breaks[field,binCounter+1] = Breaks[field,binCounter]; Counts[field,binCounter+1] = Counts[field,binCounter]; } #make adjustments (simply divide the number of counts between the new bin and the old bin) TotalBins[field]++; Breaks[field,bin+1] = newBreakPoint; Counts[field,bin+1] = Counts[field,bin] / 2; Counts[field,bin] = Counts[field,bin] / 2; } } } } } function chunkBins() { #this puts the normal bins into chunked bins so that they can be categorized into a smaller number of bins for (field = 1; field <= TotalFields; field++) { if (numericFields[field] == 1) { newBinCounter = 1; newBreaks[field,1] = Breaks[field,1]; newCounts[field,1] = Counts[field,1]; for (binCounter = 2; binCounter <= TotalBins[field]; binCounter++) { #the limit for bin count is specified here. if it is greater than the limit, it will be placed in the next bin if (newCounts[field,newBinCounter] + Counts[field,binCounter] <= totalInstances / ChunkK) newCounts[field,newBinCounter] += Counts[field,binCounter]; else { newBinCounter++; newBreaks[field,newBinCounter] = Breaks[field,binCounter]; newCounts[field,newBinCounter] = Counts[field,binCounter]; } } newTotalBins[field] = newBinCounter; } } } function findBestBins() { #take the class field and take its top N bins's Breaks' lower bound as the lower bound for being in the best class value classField = TotalFields; bestBreak = newBreaks[classField,newTotalBins[classField]-TopBins+1]; for (instance = 1; instance <= totalInstances; instance++) { #if the class value is above the best break, each field's bin in the whole instance should be in the best bin list if (Data[instance,classField] >= bestBreak) { best++; for (field = 1; field <= TotalFields; field++) { if (numericFields[field] == 1 && Data[instance,field] != NotKnown) { bin = findBin(field,Data[instance,field],newBreaks,newTotalBins); bestBins[field,bin]++; } } } } } function curveBestBins() { srand(Seed); randomRatio = rand(); tempCounter = 0; #sum up the counts in all best bins for (field = 1; field <= TotalFields-1; field++) { if (numericFields[field] == 1) { for (binCounter = 1; binCounter <= newTotalBins[field]; binCounter++) { tempCounter++; tempArray[tempCounter,1] = bestBins[field,binCounter]; tempArray[tempCounter,2] = field; tempArray[tempCounter,3] = binCounter; } } } tempArraySize = tempCounter; #this is a very slow sort. Needs to be optimized later for (i = 1; i <= tempArraySize; i++) { tempCount = tempArray[i,1]; tempField = tempArray[i,2]; tempBinCounter = tempArray[i,3]; j = i; while ((j > 1) && (tempArray[j-1,1] > tempCount)) { tempArray[j,1] = tempArray[j-1,1]; tempArray[j,2] = tempArray[j-1,2]; tempArray[j,3] = tempArray[j-1,3]; j = j - 1; } tempArray[j,1] = tempCount; tempArray[j,2] = tempField; tempArray[j,3] = tempBinCounter; } #generate the cumulative sum for each one for (tempCounter = 1; tempCounter <= tempArraySize; tempCounter++) { sumArray[tempCounter,1] = sumArray[tempCounter-1,1] + tempArray[tempCounter,1]; sumArray[tempCounter,2] = tempArray[tempCounter,2]; sumArray[tempCounter,3] = tempArray[tempCounter,3]; } #this is used for reporting the results of best ranges of each field for (field = 1; field <= TotalFields-1; field++) { selectedValue[field] = "?"; seenField[field] = 0; } #in this part, it selects the best range of each field (if above the threshold) or leave it as "?" to fill it in with the model later MinSumValue = sumArray[1,1]; MaxSumValue = sumArray[tempArraySize,1]; for (tempCounter = tempArraySize; tempCounter >= 1; tempCounter--) { if (sumArray[tempCounter,1] >= MinSumValue + (MaxSumValue - MinSumValue) * randomRatio && seenField[sumArray[tempCounter,2]]==0) { #only choose the top bin for each field and not the rest, even though it is above the threshold seenField[sumArray[tempCounter,2]] = 1; field = sumArray[tempCounter,2]; bin = sumArray[tempCounter,3]; lowerBound = newBreaks[field,bin]; if (bin == newTotalBins[field]) upperBound = Max[field]; else upperBound = newBreaks[field,bin+1]; srand(++Seed); randomRange = rand(); selectedValue[field] = lowerBound + (upperBound - lowerBound) * randomRange; } } # for (field = 1; field <= TotalFields-2; field++) # printf ("%s,",selectedValue[field]); # print selectedValue[field]; }