#!/usr/bin/gawk -f
# /* vim: set filetype=awk : */ -*- awk -*-
#
# Equal-width discretization of the numeric attributes of an ARFF file.
#
# The rules below are keyed on two variables that this script never
# assigns itself: Pass (1 = collect per-attribute min/max, 2 = rewrite
# the file) and MaxAttr (index of a numeric attribute to leave as-is).
# NOTE(review): presumably both are supplied as command-line variable
# assignments, with the same input file given once per pass -- confirm
# against the caller.

### patterns0: initialization stuff
BEGIN {
    BinLog = 0;         # reserved: the log-based bin count that read it is disabled
    Bins = 10;          # number of equal-width bins per numeric attribute
    FS = OFS = ",";
    IGNORECASE = 1;     # gawk: keywords and type names match in any case
    Inf = 10**32;       # stand-in for infinity when seeding Min/Max
}

### patterns1: skip blanks and comments
{ sub(/%.*/, "") }      # drop everything from the first '%' onwards
/^[ \t]*$/ { next }

### patterns2: react to arff keywords
# Keywords are anchored to the start of the line so that a data value
# which merely contains "@data", "@relation", ... cannot reset the state.
/^[ \t]*@relation/  { Header = 1; Data = 0; Attr = 0 }
/^[ \t]*@data/      { Header = 0; Data = 1 }
/^[ \t]*@attribute/ { Attr++ }

### patterns3: for the first pass
# react to arff keywords: remember which attribute indexes are numeric.
# The datatype must be the last token on the line; matching it anywhere
# would also catch attribute names or nominal values containing "real".
Pass == 1 && /^[ \t]*@attribute/ && /[ \t](numeric|real|integer)[ \t]*$/ {
    Numeric[Attr] = 1;
    Max[Attr] = -1 * Inf;
    Min[Attr] = Inf;
}

# handle the data section for pass one: track min/max per numeric column
Pass == 1 && Data && NF > 1 {
    for (I in Numeric) {
        if ($I ~ /\?/) continue;            # '?' marks a missing value
        if ($I > Max[I]) Max[I] = $I;
        if ($I < Min[I]) Min[I] = $I;
        # Distinct-value counts; only the disabled log-binning used them.
        if (!Seen[I, $I]) Unique[I]++;
        Seen[I, $I]++;
    }
}

### patterns4: for the second pass
# initializations for start of second pass: derive the bin width of each
# numeric attribute and report it as '%' comment lines ahead of @relation.
Pass == 2 && /^[ \t]*@relation/ {
    for (I in Numeric) {
        # Disabled alternative kept from the original:
        # Div[I]= BinLog ? round(log(Unique[I])) : Bins;
        Div[I] = (Bins >= 1) ? int(Bins) : 1;   # never divide by zero
        Bin[I] = (Max[I] - Min[I]) / Div[I];
        print "% attribute:" I "=-=min:" Min[I] "=-=max:" Max[I] \
              "=-=bins:" Div[I] "=-=steps:" Bin[I];
    }
}

# handle the attribute keywords: replace the numeric datatype with the
# nominal value list {_1,_2,...,_N}
Pass == 2 && /^[ \t]*@attribute/ {
    if ((Attr in Numeric) && Attr != MaxAttr) {
        Names = "";
        for (I = 2; I <= Div[Attr]; I++)
            Names = Names ",_" I;
        # Anchored at end of line so the attribute *name* is never rewritten.
        sub(/(integer|numeric|real)[ \t]*$/, "{_1" Names "}");
    }
}

# handle the data section for pass two
# rewrite the fields on the whole line
Pass == 2 && Data && NF > 1 {
    for (I in Numeric) {
        if (I == MaxAttr) continue;
        if ($I ~ /\?/) continue;            # keep missing values untouched
        $I = "_" label(I, $I);
    }
}

# rewrite the rewritten record
Pass == 2 { print $0 }

# functions

# round(x): nearest integer to x, with halves rounded away from zero.
function round(x) {
    return int(x + (x < 0 ? -0.5 : 0.5));
}

# label(i, val): bin number (1-based) of val for numeric attribute i,
# using Min[i] as the origin and Bin[i] as the bin width. A zero width
# (all values equal) and any result below 1 both map to bin 1.
# x is a local variable.
function label(i, val,    x) {
    if (Bin[i] == 0) return 1;
    x = round((val - Min[i]) / Bin[i]);
    return (x < 1) ? 1 : x;
}