# assumes data has been pruned already # and we know min max # Tmp=`mktemp -d` # trap "rm -rf $Tmp" 0 1 2 3 15 # gawk -f prune.awk $1 > $Tmp/pruned # (gawk -f maxmin.awk $Tmp/pruned ; cat $Tmp/pruned) | gawk -f discrete.awk BEGIN { FS=OFS=","; Klass=-1; IGNORECASE =1; DontKnow = "?"; Inf = 10^32; K1 = 64; K2 = 4; Best = 1; split("",B,""); # breakpoints split("",N,""); # counts at each break split("",D,""); # incrementally sorted N TooMuch=2; OFMT = "%.20f" } /@relation/ { Attr=0; next; } /@attribute/ { split($0,Tmp,/[ \t]+/); Attr++; Name[Attr]=Tmp[2]; next; } /@data/ { Klass= Klass < 0 ? Attr + 1 + Klass : Klass; next } /^!/ { initBKN($2,$3,$4); next} function initBKN(i,min,max, k,j) { Range[asMin(i)]=min; Range[asMax(i)]=max; k = (max - min)/K1; N[i,0]=K1; for(j=1;j<=K1;j++) { B[i,j]=min + (j-1)*k; N[i,j]=0; } } { Instances++; for(I=1;I<=NF;I++) { Data[Instances,I]=$I; if (numericp(I)) if ($I != DontKnow) store(I) } } function numericp(x) { return x > 0 && x in Range } function asMax(x) { return x } function asMin(x) { return -1 * x} function where(i, bins) { if ($i == B[i,1] ) return 1; if ($i == B[i,bins]) return bins; return binchop(B,i,$i,bins) } function store(i, here,bins,start,stop,j,newBreak,new) { start = 1; bins = stop = N[i,0]; here = where(i,bins); new = ++N[i,here]; if (new/Instances > TooMuch*bins/Instances) { N[i,0]++; newBreak = (B[i,here+1] + B[i,here])/2 for(j=bins;j> here; j--) { B[i,j+1] = B[i,j]; N[i,j+1] = N[i,j]; } B[i,here+1] = newBreak; N[i,here+1] = new/2; N[i,here] = new/2; } } function chunks(b,n, i) { for(i=1;i<=Attr;i++) if (numericp(i)) chunk(i,Instances/K2,b,n) } function chunk(i,size,b,n, count,chunks,j) { stop = N[i,0]; b[i,1] = B[i,1] n[i,1] = N[i,1]; chunks = 1; for(j=2;j<=stop;j++) { if (n[i,chunks] + N[i,j] > size) b[i,++chunks] = B[i,j] n[i,chunks] += N[i,j]; } n[i,0]=chunks } function debug( i) { print "Instances", Instances for(i=1;i<=Attr;i++) if (numericp(Attr)) { print " " print N[i,0] say("B",B,i,N[i,0]); say("N",N,i,N[i,0]); } } function debug1(b,n, i) { print "Instances", Instances for(i=1;i<=Attr;i++) if (numericp(Attr)) { print " " print i,n[i,0] say("b",b,i,n[i,0]); say("n",n,i,n[i,0]); } } function say(str,a,i,n, j) { for(j=1;j<=n;j++) print str "["i"," j "]=" a[i,j] } function binchop(a,key,x,most, left,middle,right) { left=1; right=most; while (left < right) { middle=int((left+right)/2); if (x> a[key SUBSEP middle]) {left=middle+1} else {right=middle}; }; if (x=a[key SUBSEP left]) {return left} else {return 0}; } END { main(Data) } function main(raw, b,n,cooked) { chunks(b,n); discretize(b,n,raw,cooked); dump(cooked); } function dump(cooked, i,j) { for(i=1;i<=Instances;i++) { printf("%s",cooked[i,1]) for(j=2;j<=Attr;j++) printf(",%s",cooked[i,j]) print "" } } function discretize(b,n,raw,cooked, i,j) { for(i=1;i<=Instances;i++) { for(j=1;j<=Attr;j++) cooked[i,j] = discretize1(b,n,raw,i,j) cooked[i,Klass] = bestp(n[Klass,0],cooked[i,Klass]) } } function discretize1(b,n,raw,i,j) { if (raw[i,j] ~ DontKnow) return DontKnow return numericp(j) ? binchop(b,j,raw[i,j],n[j,0]) : raw[i,j] } function bestp(max,val) { #print "max ",max," val ", val," max-Best ",max-Best return val > max-Best } # for(i in N) print i " " N[i] | "sort"; close("sort") # for(i in B) print i " " B[i] | "sort"; close("sort")