#!/usr/bin/gawk -f BEGIN { FS = OFS = ","; NotKnown = "?"; NormalK = 80; ChunkK = 10; TooMuch = 2; TopBins = 1; Seed = 0; TotalFields = 6; RunCount = 100; OFMT="%10.10g" Policy=3; } function model(a,b,c,d,e) { # return a*b*c*d*e; # return a+b+c+d+e; # y=(b+3)^3+3*(b+3)^2-2*(b+3)+1#+b-c-d+e; # if (y<84) # return y; # else # return 0; y = 2 - (1-b)^2 + 4 - (2 - d)^2; return y; } END { srand(Seed); init(); for (era = 1; era <= 20; era++) { findMinMax(); initBins(); fillBins(); chunkBins(); findBestBins(); # clean the first (normal) layer of data which is only the Data array and get ready to put the new instances in delete Data; for (run = 1; run <= RunCount; run++) { curveBestBins(); a = selectedValue[1]; b = selectedValue[2]; c = selectedValue[3]; d = selectedValue[4]; e = selectedValue[5]; #if these values are equal to "?" they were not determined from the curve. otherwise, they are cached and can be used if (a == "?") a = x_a(); if (b == "?") b = x_b(); if (c == "?") c = x_c(); if (d == "?") d = x_d(); if (e == "?") e = x_e(); InputArray[1] = a; InputArray[2] = b; InputArray[3] = c; InputArray[4] = d; InputArray[5] = e; InputArray[6] = score = model(a,b,c,d,e); addInstance(InputArray); if (score > maxScore) { maxScore = score; bestResults[1]=a; bestResults[2]=b; bestResults[3]=c; bestResults[4]=d; bestResults[5]=e; bestResults[6]=score; bestResults[7]=era; } if (score > maxEraScore[era]) { maxEraScore[era] = score; eraResults[1]=a; eraResults[2]=b; eraResults[3]=c; eraResults[4]=d; eraResults[5]=e; eraResults[6]=score; eraResults[7]=era; } } print "best of era #"eraResults[7],eraResults[1],eraResults[2],eraResults[3],eraResults[4],eraResults[5],eraResults[6]; # clean the second (chunk) layer of data which is all other arrays but the Data array and get ready for next chunking delete Max; delete Min; delete TotalBins; delete Breaks; delete Counts; delete ChunkedTotalBins; delete ChunkedBreaks; delete ChunkedCounts; delete bestBins; delete tempArray; delete sumArray; delete topBin; delete seenField; delete selectedValue; delete goodBins; } print "overall best is at era #"bestResults[7],bestResults[1],bestResults[2],bestResults[3],bestResults[4],bestResults[5],bestResults[6]; } function x_a() {return real(1,100)} function x_b() {return radian() } function x_c() {return real(1,100)} function x_d() {return radian() } function x_e() {return real(1,100)} function radian() { return rand()*2*1068966896/340262731; } function between(min,max) { if (max == min) return min; else return min+(max-min)*rand(); } function real(min,max) { return between(min,max); } function round(x) { return int(x+0.5); } function init() { maxScore = -(10^(20)); #NOTE: this needs more attention for (field = 1; field <= TotalFields; field++) NumericFields[field] = 1; for (counter = 1; counter <= RunCount; counter++) { a = x_a(); b = x_b(); c = x_c(); d = x_d(); e = x_e(); InputArray[1] = a; InputArray[2] = b; InputArray[3] = c; InputArray[4] = d; InputArray[5] = e; InputArray[6] = score = model(a,b,c,d,e); addInstance(InputArray); if (score > maxScore) { maxScore = score; bestResults[1]=a; bestResults[2]=b; bestResults[3]=c; bestResults[4]=d; bestResults[5]=e; bestResults[6]=score; bestResults[7]=era; } } } function addInstance(InputArray) { Data[0,0]++; for (field = 1; field <= TotalFields; field++) Data[Data[0,0],field] = InputArray[field]; } function findMinMax() { for (field = 1; field <= TotalFields; field++) { if (NumericFields[field] == 1) { Min[field] = Data[1,field]; Max[field] = Data[1,field]; } } for (instance = 1; instance <= Data[0,0]; instance++) { for (field = 1; field <= TotalFields; field++) { if (NumericFields[field] == 1 && Data[instance,field] != NotKnown) { if (Data[instance,field] < Min[field]) Min[field] = Data[instance,field]; if (Data[instance,field] > Max[field]) Max[field] = Data[instance,field]; } } } } function initBins() { for (field = 1; field <= TotalFields; field++) { if (NumericFields[field] == 1) { TotalBins[field] = NormalK; for (binCounter = 1; binCounter <= TotalBins[field]; binCounter++) { Breaks[field,binCounter] = Min[field] + (binCounter - 1) * (Max[field] - Min[field])/TotalBins[field]; Counts[field,binCounter] = 0; } } } } function findBin(field,value,inputBreaks,inputTotalBins) { #this returns a bin number that contains the value passed to the function if (value <= inputBreaks[field,1]) return 1; else if (value >= inputBreaks[field,inputTotalBins[field]]) return inputTotalBins[field]; else { #find the correct bin using bin chop algorithm leftBin = 1; rightBin = inputTotalBins[field]; while (leftBin < rightBin) { middleBin = int((leftBin + rightBin) / 2); if (value > inputBreaks[field,middleBin]) leftBin = middleBin + 1; else if (value < inputBreaks[field,middleBin]) rightBin = middleBin; else return middleBin; } return leftBin - 1; } } function fillBins() { for (instance = 1; instance <= Data[0,0]; instance++) { for (field = 1; field <= TotalFields; field++) { if (NumericFields[field] == 1 && Data[instance,field] != NotKnown) { #increase the right bin's count bin = findBin(field,Data[instance,field],Breaks,TotalBins); Counts[field,bin]++; #if the count it too high (measure according the equation below), split the bin and make adjustments if (Counts[field,bin] > TooMuch*TotalBins[field]) { #make a break point in the middle of the current bin (upper bound is the same as lower bound of the next bin if exists or Max) if (bin == TotalBins[field]) newBreakPoint = (Breaks[field,bin] + Max[field])/2; else newBreakPoint = (Breaks[field,bin] + Breaks[field,bin+1]) / 2; #shift the bins one up for (binCounter = TotalBins[field]; binCounter > bin; binCounter--) { Breaks[field,binCounter+1] = Breaks[field,binCounter]; Counts[field,binCounter+1] = Counts[field,binCounter]; } #make adjustments (simply divide the number of counts between the new bin and the old bin) TotalBins[field]++; Breaks[field,bin+1] = newBreakPoint; Counts[field,bin+1] = Counts[field,bin] / 2; Counts[field,bin] = Counts[field,bin] / 2; } } } } } function chunkBins() { #this puts the normal bins into chunked bins so that they can be categorized into a smaller number of bins for (field = 1; field <= TotalFields; field++) { if (NumericFields[field] == 1) { newBinCounter = 1; ChunkedBreaks[field,1] = Breaks[field,1]; ChunkedCounts[field,1] = Counts[field,1]; for (binCounter = 2; binCounter <= TotalBins[field]; binCounter++) { #the limit for bin count is specified here. if it is greater than the limit, it will be placed in the next bin if (ChunkedCounts[field,newBinCounter] + Counts[field,binCounter] <= Data[0,0] / ChunkK) ChunkedCounts[field,newBinCounter] += Counts[field,binCounter]; else { newBinCounter++; ChunkedBreaks[field,newBinCounter] = Breaks[field,binCounter]; ChunkedCounts[field,newBinCounter] = Counts[field,binCounter]; } } ChunkedTotalBins[field] = newBinCounter; } } } function findBestBins() { #take the class field and take its top N bins's Breaks' lower bound as the lower bound for being in the best class value classField = TotalFields; bestBreak = ChunkedBreaks[classField,ChunkedTotalBins[classField]-TopBins+1]; for (instance = 1; instance <= Data[0,0]; instance++) { #if the class value is above the best break, each field's bin in the whole instance should be in the best bin list if (Data[instance,classField] >= bestBreak) { for (field = 1; field <= TotalFields; field++) { if (NumericFields[field] == 1 && Data[instance,field] != NotKnown) { bin = findBin(field,Data[instance,field],ChunkedBreaks,ChunkedTotalBins); bestBins[field,bin]++; } } } } } function curveBestBins() { randomRatio = rand(); tempCounter = 0; #sum up the counts in all best bins for (field = 1; field <= TotalFields-1; field++) { if (NumericFields[field] == 1) { for (binCounter = 1; binCounter <= ChunkedTotalBins[field]; binCounter++) { tempCounter++; tempArray[tempCounter,1] = bestBins[field,binCounter]; tempArray[tempCounter,2] = field; tempArray[tempCounter,3] = binCounter; } } } tempArraySize = tempCounter; #NOTE: this is a very slow sort. Needs to be optimized later for (i = 1; i <= tempArraySize; i++) { tempCount = tempArray[i,1]; tempField = tempArray[i,2]; tempBinCounter = tempArray[i,3]; j = i; while ((j > 1) && (tempArray[j-1,1] > tempCount)) { tempArray[j,1] = tempArray[j-1,1]; tempArray[j,2] = tempArray[j-1,2]; tempArray[j,3] = tempArray[j-1,3]; j = j - 1; } tempArray[j,1] = tempCount; tempArray[j,2] = tempField; tempArray[j,3] = tempBinCounter; } #generate the cumulative sum for each one for (tempCounter = 1; tempCounter <= tempArraySize; tempCounter++) { sumArray[tempCounter,1] = sumArray[tempCounter-1,1] + tempArray[tempCounter,1]; sumArray[tempCounter,2] = tempArray[tempCounter,2]; sumArray[tempCounter,3] = tempArray[tempCounter,3]; } #this is used for reporting the results of best ranges of each field for (field = 1; field <= TotalFields-1; field++) { #NOTE: include these also in the prototype selectedValue[field] = "?"; seenField[field] = 0; topBin[field] = 0; goodBins[field,0] = 0; } #in this part, it selects the best range of each field (if above the threshold) or leave it as "?" to fill it in with the model later MinSumValue = sumArray[1,1]; MaxSumValue = sumArray[tempArraySize,1]; #THIS IS POLICY 1: It randomly selects one of the best bins (the ones above the threshold) for each field and then selects a random number in that range if (Policy == 1) { for (tempCounter = tempArraySize; tempCounter >= 1; tempCounter--) { if (sumArray[tempCounter,1] >= MinSumValue + (MaxSumValue - MinSumValue) * randomRatio) { field = sumArray[tempCounter,2]; goodBins[field,++goodBins[field,0]] = sumArray[tempCounter,3]; } } for (field = 1; field <= TotalFields - 1; field++) { if (goodBins[field,0] != 0) { randomBin = int(rand()*(goodBins[field,0]-1)+1); bin = goodBins[field,randomBin]; lowerBound = ChunkedBreaks[field,bin]; if (bin == ChunkedTotalBins[field]) upperBound = Max[field]; else upperBound = ChunkedBreaks[field,bin+1]; randomRange = rand(); selectedValue[field] = lowerBound + (upperBound - lowerBound) * randomRange; } } } #THIS IS POLICY 2: Best bin with minimum bin# is selected for each field. Then a random number from bin's break up to max value for that field is selected. if (Policy == 2) { for (tempCounter = tempArraySize; tempCounter >= 1; tempCounter--) { if (sumArray[tempCounter,1] >= MinSumValue + (MaxSumValue - MinSumValue) * randomRatio) { field = sumArray[tempCounter,2]; if (topBin[field] == 0) topBin[field] = sumArray[tempCounter,3]; else if (topBin[field] < sumArray[tempCounter,3]) topBin[field] = sumArray[tempCounter,3]; } } for (field = 1; field <= TotalFields - 1; field++) { if (topBin[field] != 0) { bin = topBin[field]; lowerBound = ChunkedBreaks[field,bin]; upperBound = Max[field]; randomRange = rand(); selectedValue[field] = lowerBound + (upperBound - lowerBound) * randomRange; } } } #THIS IS POLICY 3: First bin seen when going down the sum values is selected for each field and then a random number in that bin's range (only) is selected. if (Policy == 3) { for (tempCounter = tempArraySize; tempCounter >= 1; tempCounter--) { if (sumArray[tempCounter,1] >= MinSumValue + (MaxSumValue - MinSumValue) * randomRatio && seenField[sumArray[tempCounter,2]] == 0) { #only choose the top bin for each field and not the rest, even though they are above the threshold seenField[sumArray[tempCounter,2]] = 1; field = sumArray[tempCounter,2]; bin = sumArray[tempCounter,3]; lowerBound = ChunkedBreaks[field,bin]; if (bin == ChunkedTotalBins[field]) upperBound = Max[field]; else upperBound = ChunkedBreaks[field,bin+1]; randomRange = rand(); selectedValue[field] = lowerBound + (upperBound - lowerBound) * randomRange; } } } }