# Discretize the columns of an ARFF-style file into NumBins bins.
#
# Input directives handled:
#   @relation NAME   remembered and echoed as "NAME-discretized"
#   @attribute NAME  declares an ordinary column
#   @class NAME      declares a class column (always printed unchanged)
#   @data            every following line is read as a data row
# Everything from "%" to end of line is stripped; blank lines are skipped.
# Fields are split with the default FS (whitespace).
#
# Requires gawk: asort() is a gawk extension.
# NumBins may be supplied on the command line (-v NumBins=5); default is 3.

BEGIN {
    # Keep a caller-supplied positive bin count, otherwise fall back to 3.
    # This also guarantees the divisions by NumBins below never see zero.
    NumBins = (NumBins + 0 >= 1) ? int(NumBins) : 3
}

# Rule order matters: the "In" rule precedes the "@data" rule so that the
# "@data" line itself is not read as a row.
             { gsub(/%.*/, "") }
/^[ \t]*$/   { next }
/@relation/  { Relation = $2 }
/@attribute/ { initCol() }
/@class/     { initClass() }
In           { readRow() }
/^@data/     { In = 1 }
/^@/         { next }

# Register the column named by field 2 of the current line.
function initCol() {
    Columns[++Cols] = $2
}

# Register the column named by field 2 and mark it as a class column.
function initClass() {
    Columns[++Cols] = $2
    Class[Cols] = 1
}

# Store the current line as a data row and record each column's distinct
# values. Uniques[c] holds the count of distinct values seen in column c;
# Uniques[i, c] holds the i-th distinct value in order of first appearance.
function readRow(    c, n) {
    ++Rows
    for (c = 1; c <= Cols; c++) {
        if (!(($c, c) in Seen)) {
            Seen[$c, c] = 1
            n = ++Uniques[c]
            Uniques[n, c] = $c
        }
        Data[Rows, c] = $c
    }
}

# Sort each column's distinct values in place (asort ordering).
function sortUniques(    tmp, c, u) {
    for (c = 1; c <= Cols; c++) {
        # Clear the scratch array for every column: leftovers from a column
        # with more distinct values would otherwise be sorted into this one.
        split("", tmp)
        for (u = 1; u <= Uniques[c]; u++)
            tmp[u] = Uniques[u, c]
        asort(tmp)
        for (u = 1; u <= Uniques[c]; u++)
            Uniques[u, c] = tmp[u]
    }
}

# Choose NumBins bin labels per column: Bin[b, c] is the sorted distinct
# value at position b * int(Uniques[c] / NumBins). Must run after
# sortUniques().
function getBins(    c, b, cutoff) {
    for (c = 1; c <= Cols; c++) {
        cutoff = int(Uniques[c] / NumBins)
        for (b = 1; b <= NumBins; b++)
            Bin[b, c] = Uniques[cutoff * b, c]
    }
}

# Build Rosetta[value, c], the bin label each raw value of column c maps to.
# The column's values are sorted and the bin index advances after every
# `cutoff` rows. A value that straddles a boundary ends up with the later
# bin, because later assignments overwrite earlier ones.
function mapBins(    tmp, c, r, b, cutoff) {
    # cutoff > Rows / NumBins, so b never exceeds NumBins.
    cutoff = int(Rows / NumBins) + 1
    for (c = 1; c <= Cols; c++) {
        split("", tmp)
        for (r = 1; r <= Rows; r++)
            tmp[r] = Data[r, c]
        asort(tmp)
        b = 1
        for (r = 1; r <= Rows; r++) {
            Rosetta[tmp[r], c] = Bin[b, c]
            if (r % cutoff == 0)
                b++
        }
    }
}

# Print every data row. Class columns and columns with at most
# 1.5 * NumBins distinct values are printed unchanged; all other columns
# are printed as their bin label.
function printBins(    r, c) {
    for (r = 1; r <= Rows; r++) {
        for (c = 1; c <= Cols; c++) {
            # Use an explicit "%s" format so data is never interpreted
            # as a printf format string.
            if (Uniques[c] <= 1.5 * NumBins || Class[c])
                printf "%s ", Data[r, c]
            else
                printf "%s ", Rosetta[Data[r, c], c]
        }
        print ""
    }
}

# Print the header: the relation line, one @class/@attribute line per
# column listing its values (raw distinct values, or bin labels for
# discretized columns), then the @data marker.
function printHeader(    c, r, b, tmp) {
    print "@relation " Relation "-discretized\n"
    for (c = 1; c <= Cols; c++) {
        if (Class[c]) {
            # Sort a private copy so this listing does not depend on
            # sortUniques() having been called first.
            split("", tmp)
            printf "@class %s ", Columns[c]
            for (r = 1; r <= Uniques[c]; r++)
                tmp[r] = Uniques[r, c]
            asort(tmp)
            for (r = 1; r <= Uniques[c]; r++)
                printf "%s ", tmp[r]
            print ""
        } else if (Uniques[c] <= 1.5 * NumBins) {
            printf "@attribute %s ", Columns[c]
            for (r = 1; r <= Uniques[c]; r++)
                printf "%s ", Uniques[r, c]
            print ""
        } else {
            printf "@attribute %s ", Columns[c]
            for (b = 1; b <= NumBins; b++)
                printf "%s ", Bin[b, c]
            print ""
        }
    }
    print "\n@data"
}

END {
    sortUniques()
    getBins()
    mapBins()
    printHeader()
    printBins()
}