#!/usr/bin/gawk -f
#
# Reads an ARFF-style file, keeps only the numeric attributes, selects the
# instances whose class value is close to the class maximum ("best"
# instances) and prints NewInstanceLimit synthetic instances built around
# values taken from those best instances.
#
# Note: this assumes that the last attribute is the class attribute.
#
# Output: one line per new instance; every numeric field is printed followed
# by a comma (so each line ends with a trailing comma, as it always has).

BEGIN {
    FS = OFS = ",";
    NotKnown = "?";          # marker used in the data for a missing value
    BestClassRatio = 0.1;    # instances within this fraction of the class maximum are "best"
    Seed = 1;                # seed handed to srand() so runs are repeatable
    NewInstanceLimit = 20;   # number of synthetic instances to print
}

# Attribute declaration. The keyword is matched case-insensitively and only at
# the start of the line, so a commented-out declaration ("% @attribute ...")
# is not counted as a field.
tolower($0) ~ /^[ \t]*@attribute[ \t]/ {
    totalFields++;
    # Only the declared type (the last token on the line) decides whether the
    # attribute is numeric; attribute names or nominal labels that merely
    # contain "real"/"integer"/"numeric" do not.
    if (tolower($0) ~ /[ \t](numeric|real|integer)[ \t\r]*$/)
        numericFields[totalFields] = 1;
    next;
}

# Blank (or whitespace-only) lines, comments and remaining header lines carry
# no data.
/^[ \t\r]*$/ || /^[ \t]*[%@]/ {
    next;
}

# Data line: strip blanks (and a stray carriage return) and store the value
# of every numeric field.
{
    gsub(/[ \t\r]/, "");
    totalInstances++;
    for (field = 1; field <= totalFields; field++)
        if (numericFields[field] == 1)
            Data[totalInstances, field] = $field;
}

# findMinMax()
#   Fills Min[field] and Max[field] for every numeric field, ignoring values
#   equal to NotKnown. A field with no known value keeps the sentinels
#   Min = 10^20 and Max = -(10^20).
#   Names after the extra spaces in the parameter list are locals.
function findMinMax(    field, instance, value) {
    for (field = 1; field <= totalFields; field++) {
        if (numericFields[field] == 1) {
            Min[field] = 10^20;
            Max[field] = -(10^20);
        }
    }

    for (instance = 1; instance <= totalInstances; instance++) {
        for (field = 1; field <= totalFields; field++) {
            if (numericFields[field] != 1)
                continue;
            value = Data[instance, field];
            if (value == NotKnown)
                continue;
            if (value < Min[field])
                Min[field] = value;
            if (value > Max[field])
                Max[field] = value;
        }
    }
}

# findBestInstances()
#   Copies into bestData[] the numeric fields of every instance whose class
#   value (last field) reaches the threshold derived from Max[totalFields]
#   and BestClassRatio. bestData[0,0] receives the number of copied instances.
#   Must run after findMinMax().
function findBestInstances(    instance, field, classMax, classValue, threshold, bestCount) {
    classMax = Max[totalFields];

    # For a non-negative maximum this is the original (1 - ratio) * max.
    # For a negative maximum that product would lie ABOVE the maximum and no
    # instance could ever qualify, so the margin is applied downwards instead.
    if (classMax >= 0)
        threshold = (1 - BestClassRatio) * classMax;
    else
        threshold = (1 + BestClassRatio) * classMax;

    bestCount = 0;
    for (instance = 1; instance <= totalInstances; instance++) {
        classValue = Data[instance, totalFields];

        # An unknown class must not qualify: compared as a string, "?" sorts
        # after every digit and would otherwise always pass the test below.
        if (classValue == NotKnown)
            continue;

        if (classValue >= threshold) {
            bestCount++;
            for (field = 1; field <= totalFields; field++) {
                if (numericFields[field] == 1)
                    bestData[bestCount, field] = Data[instance, field];
            }
        }
    }
    bestData[0,0] = bestCount;
}

# findBestRanges(field, tempRange)
#   For a numeric field, sets
#     center[field] = a randomly chosen known value of that field among the
#                     best instances,
#     radius[field] = median of those values minus Min[field].
#   When the field has no known value among the best instances,
#   center[field] is set to NotKnown and radius[field] to 0.
#   field     - index of the attribute to process
#   tempRange - scratch array; callers normally omit it so it acts as a local
#   Exactly one rand() value is consumed per numeric field.
function findBestRanges(field, tempRange,    instance, count, size, median, draw, randomIndex) {
    if (numericFields[field] != 1)
        return;

    count = 0;
    for (instance = 1; instance <= bestData[0,0]; instance++) {
        if (bestData[instance, field] != NotKnown)
            tempRange[++count] = bestData[instance, field];
    }

    # Drawn unconditionally so the random sequence seen by later fields does
    # not depend on whether this field had any usable value.
    draw = rand();

    if (count == 0) {
        center[field] = NotKnown;
        radius[field] = 0;
        return;
    }

    size = asort(tempRange);

    # find its median
    if (size % 2 == 0)
        median = (tempRange[size / 2] + tempRange[size / 2 + 1]) / 2;
    else
        median = tempRange[(size + 1) / 2];

    randomIndex = 1 + int(size * draw);
    # print "for field " field, " median="median " random index=" randomIndex " with value=" tempRange[randomIndex];

    radius[field] = median - Min[field];
    center[field] = tempRange[randomIndex];
}

# generateNewInstances()
#   Fills newInstance[1..NewInstanceLimit, field] for every numeric field with
#   center - radius, center, or center + radius, chosen by a rand() draw
#   (<= 0.33, <= 0.66, otherwise). A field whose center is NotKnown yields
#   NotKnown; the draw is still consumed for it.
function generateNewInstances(    instanceCounter, field, randomValue) {
    for (instanceCounter = 1; instanceCounter <= NewInstanceLimit; instanceCounter++) {
        for (field = 1; field <= totalFields; field++) {
            if (numericFields[field] != 1)
                continue;

            randomValue = rand();

            if (center[field] == NotKnown) {
                newInstance[instanceCounter, field] = NotKnown;
                continue;
            }

            if (randomValue <= 0.33)
                newInstance[instanceCounter, field] = center[field] - radius[field];
            else if (randomValue <= 0.66)
                newInstance[instanceCounter, field] = center[field];
            else
                newInstance[instanceCounter, field] = center[field] + radius[field];
        }
    }
}

END {
    srand(Seed);

    findMinMax();
    # for (field = 1; field <= totalFields; field++)
    #     if (numericFields[field] == 1)
    #         print "Field #" field " has min="Min[field] " and max="Max[field];

    findBestInstances();
    # for (instance = 1; instance <= bestData[0,0]; instance++)
    # {
    #     for (field = 1; field <= totalFields; field++)
    #         if (numericFields[field] == 1)
    #             printf("%s,", bestData[instance,field]);
    #     print "";
    # }
    # print "";
    # print "best instances are " bestData[0,0] " out of " totalInstances;

    for (field = 1; field <= totalFields; field++)
        findBestRanges(field);

    generateNewInstances();

    # print "these are the new instances:"
    for (instance = 1; instance <= NewInstanceLimit; instance++) {
        for (field = 1; field <= totalFields; field++)
            if (numericFields[field] == 1)
                printf("%s,", newInstance[instance, field]);
        print "";
    }
}