#!/usr/bin/gawk -f
###
### Equal-frequency discretizer for ARFF files.
###
### Reads an ARFF file containing numeric and/or nominal attributes and writes
### an ARFF file in which every numeric attribute ( real, integer, numeric )
### has been replaced by a nominal attribute with the values
### '_1_' .. '_<nBins>_'.  Values are assigned to bins by rank, so each bin
### receives ( as nearly as possible ) the same number of values.  The value
### range covered by each bin therefore varies, which yields a flat histogram.
###
### Missing values ( "?" or an empty field ) are passed through unchanged and
### do not count towards the bin sizes.
###
### -Command-Line Arguments-
###   nBins - The number of bins to use. ( Default=10 )
###           May be given as "-v nBins=5" or as an "nBins=5" operand placed
###           before the input file name.
###
### Requires gawk ( IGNORECASE, asort, /dev/stderr ).

BEGIN {
    IGNORECASE = 1;      ## ARFF keywords are case-insensitive.
    FS = OFS = ",";

    ## Only apply the default when nothing was supplied with -v; an
    ## unconditional assignment here would silently discard "-v nBins=N".
    if ( nBins == "" )
        nBins = 10;

    dataLine = 0;        ## If this is 1, we are looking at data in the arff file.
    noInst   = 0;        ## The number of data instances.
    noAttrs  = 0;        ## The number of attributes.
}

### Drop a trailing carriage return so CRLF files behave like LF files.
{ sub( /\r$/, "" ); }

### Ignore comment lines.  Only lines whose first non-blank character is '%'
### are comments; a '%' inside a data value must not discard the row.
/^[ \t]*%/ { next; }

### Just echo the relation line.
/^[ \t]*@relation/ { print $0; next; }

### Remember each attribute declaration, in order of appearance.
/^[ \t]*@attribute/ { attrs[noAttrs++] = $0; next; }

### Everything after the @data marker is instance data.
/^[ \t]*@data/ { dataLine = 1; next; }

### Store one instance: attVals[ instance, attribute ], both 0-based.
dataLine {
    ## Blank lines carry no instance.
    if ( $0 ~ /^[ \t]*$/ )
        next;

    for ( f = 1; f <= NF; f++ )
        attVals[noInst, f-1] = $f;
    noInst++;
}

END {
    ## Validate the bin count; a value below 1 would divide by zero below.
    nBins = int( nBins );
    if ( nBins < 1 ) {
        print "warning: nBins must be >= 1; using 10" > "/dev/stderr";
        nBins = 10;
    }

    ## A numeric declaration ends with the type keyword.  Matching only the
    ## trailing token keeps attribute NAMES that merely contain "real",
    ## "integer" or "numeric" from being mistaken for numeric attributes.
    numericType = "[ \t]+(real|integer|numeric)[ \t]*$";

    ### Print the header, rewriting numeric attributes as nominal ones.
    ### atStat[a] is 1 when attribute a is to be discretized, otherwise 0.
    for ( a = 0; a < noAttrs; a++ ) {
        if ( attrs[a] ~ numericType ) {
            sub( numericType, "", attrs[a] );
            atStat[a] = 1;
            attrs[a] = attrs[a] " {'_1_'";
            for ( n = 2; n <= nBins; n++ )
                attrs[a] = attrs[a] ",'_" n "_'";
            attrs[a] = attrs[a] "}";
        } else {
            atStat[a] = 0;
        }
        print attrs[a];
    }
    print "@data";

    ### Equal-frequency discretize every numeric attribute.
    for ( a = 0; a < noAttrs; a++ ) {
        if ( !atStat[a] )
            continue;

        ## Start from empty work arrays: asort() reindexes to 1..n, so
        ## leftovers from a previous attribute would otherwise be sorted in.
        delete sorted;
        delete queueIdx;
        delete queueLen;
        delete queuePos;

        ## Collect the non-missing values.  For every distinct spelling of a
        ## value keep a queue of the instances holding it, in input order, so
        ## each rank can later be handed to an instance without rescanning.
        n = 0;
        for ( i = 0; i < noInst; i++ ) {
            val = attVals[i,a];
            if ( isMissing( val ) )
                continue;
            sorted[++n] = val;
            key = val "";
            queueIdx[key, ++queueLen[key]] = i;
        }
        if ( n == 0 )
            continue;

        ## Sort ascending; numeric-looking fields compare as numbers.
        n = asort( sorted );

        for ( k = 1; k <= n; k++ ) {
            ## binNo - The bin for the value of rank k.  Ranks 1..n are
            ## spread evenly over bins 1..nBins.
            binNo = int( ( k - 1 ) * nBins / n ) + 1;

            ## Give this rank to the next unassigned instance that holds
            ## exactly this value.
            key = sorted[k] "";
            i = queueIdx[key, ++queuePos[key]];
            attVals[i,a] = "_" binNo "_";
        }
    }

    ### Print the discretized data, one instance per line.
    for ( i = 0; i < noInst; i++ ) {
        line = attVals[i,0];
        for ( a = 1; a < noAttrs; a++ )
            line = line "," attVals[i,a];
        print line;
    }
}

### Return true when v is an ARFF missing value: "?" or an empty field,
### optionally surrounded by blanks.
function isMissing( v )
{
    return v ~ /^[ \t]*\??[ \t]*$/;
}

### Round a non-negative real to the nearest integer ( halves round up ).
### No longer used by the binning above; kept so the script still defines it.
function round( x )
{
    return int( x + 0.5 );
}