#!/usr/bin/gawk -f
#
# Discretize the class column of an ARFF file into equal-width bins.
#
# Input : an ARFF file (stdin or file operands), comma-separated data rows.
# Output: the same file with
#   - `%` comments and blank lines removed,
#   - the LAST @attribute (treated as the class) rewritten, when its declared
#     type is numeric, to the symbol list _1_,_2_,...,_<bins>_,
#   - the last field of every data row replaced by the symbol of the bin its
#     value falls into (`?` is passed through unchanged).
#
# Data rows are buffered and only printed in END, because the class range
# (min/max) is unknown until the whole input has been read.
#
# Tunable: bins (default 10), e.g.  gawk -v bins=5 -f thisfile data.arff

# Map a value to its 1-based equal-width bin, clamped to [1, nbins].
#   value : class value to place
#   lo    : smallest class value seen
#   width : width of one bin; <= 0 means the range is degenerate
#   nbins : number of bins
# `b` is a local variable (awk idiom: extra parameter).
function binOf(value, lo, width, nbins,    b) {
    # Degenerate range (no numeric values, or all equal): everything is bin 1.
    # This also avoids a fatal division by zero.
    if (width <= 0)
        return 1
    b = int((value - lo) / width) + 1
    if (b < 1)
        b = 1
    # value == max lands on nbins + 1 before clamping.
    if (b > nbins)
        b = nbins
    return b
}

BEGIN {
    FS = OFS = ","
    IGNORECASE = 1      # gawk: keywords such as @DATA match in any case
    # Keep a value supplied with -v; otherwise fall back to the default.
    if (bins < 1)
        bins = 10
    dataLine = 0        # becomes 1 once the @data line has been seen
    attr = 0            # number of @attribute lines collected so far
    data = 0            # number of data rows buffered so far
    seen = 0            # becomes 1 once a non-missing class value was read
    # Numeric type keyword at the very end of an @attribute declaration.
    # Anchoring to the end keeps a keyword inside the attribute NAME
    # (e.g. "realValue") from being mistaken for the type.
    numericType = "(real|continuous|integer|number|numeric)[ \t\r]*$"
}

# Strip `%` comments, then drop lines that are left blank.
{ sub(/%.*/, "") }
/^[ \t]*$/ { next }

/@relation/ { print $0 }

# Hold attribute lines back: the last one may need rewriting at @data.
/@attribute/ { attrs[attr++] = $0 }

/@data/ {
    dataLine = 1
    attr--              # attr now indexes the last attribute, i.e. the class
    for (i = 0; i < attr; i++)
        print attrs[i]
    if (attrs[attr] ~ numericType) {
        sub(numericType, "", attrs[attr])
        symbols = "_1_"
        for (i = 2; i <= bins; i++)
            symbols = symbols ",_" i "_"
        # NOTE(review): ARFF nominal declarations are normally written
        # {a,b,c}; the list is emitted without braces, exactly as before.
        # Confirm what the downstream reader expects.
        attrs[attr] = attrs[attr] symbols
    }
    print attrs[attr]
    print $0
    next
}

# Data section: buffer each row and track the range of the class column.
dataLine == 1 {
    # Every field except the last is passed through unchanged.
    datas[data] = $1
    for (d = 2; d < NF; d++)
        datas[data] = datas[data] "," $d
    classes[data] = $NF
    if ($NF !~ /\?/) {          # `?` marks a missing value; ignore for range
        v = $NF + 0             # force a numeric comparison
        if (!seen || v > max) max = v
        if (!seen || v < min) min = v
        seen = 1
    }
    data++
}

END {
    # Bin width is kept fractional: truncating it to an integer made it 0
    # (division by zero) whenever the class range was smaller than `bins`.
    width = seen ? (max - min) / bins : 0
    for (d = 0; d < data; d++) {
        if (classes[d] ~ /\?/)
            label = "?"         # keep missing class values missing
        else
            label = "_" binOf(classes[d] + 0, min, width, bins) "_"
        print datas[d] "," label
    }
}