BEGIN { FS=OFS=","; Prune=1; Klass= -1; IGNORECASE =1; DontKnow = "?"; Inf = 10^32; Bins = 10; Rest = 8; OFMT = "%.20f" } Prune { sub(/\%.*/,"") } /^[ \t]*$/ { next } /@relation/ && Pass> 1 { Klass= Klass < 0 ? Attr + 1 + Klass : Klass } /@relation/ { Attr=0; } /@attribute/ { split($0,Tmp,/[ \t]+/); Attr++; Name[Attr]=Tmp[2];} Pass==1 && /@attribute/ && (/numeric/ || /real/ || /integer/) { Range[asMax(Attr)] = -1*Inf; Range[asMin(Attr) ] = Inf; } /@[rda]/ { next} Pass==1 { for(I in Range) if (I > 0) { if ( $I ~ DontKnow ) continue; if ( $I > Range[asMax(I)]) Range[asMax(I)]=$I; if ( $I < Range[asMin(I)]) Range[asMin(I)]=$I; }} Pass==2 { N++; BestOrRest= bestp(); for(I=1;I<=NF;I++) if (I != Klass) record(N,I,$I,discretize(I),BestOrRest); record(N,Klass,$Klass,BestOrRest,BestOrRest) } function numericp(x) { return x in Range } function asMax(x) { return x } function asMin(x) { return -1 * x} function discretize(i, min,max, bin) { if ($i ~ DontKnow) return $i; if (! (i in Range)) return $i; min=Range[asMin(i)]; max=Range[asMax(i)]; bin = 1 + int(($i - min)/((max - min)/Bins)); return bin > Bins ? Bins : bin } function bestp() { #print "kkk",Klass,"min",Range[asMin(Klass)],\ # "kl",$Klass,"max",Range[asMax(Klass)],\ # "d",discretize(Klass), "r",Rest, "p",(discretize(Klass) > Rest) return discretize(Klass) > Rest } function record(n,attr,raw,range,class, i) { Data[n,attr]=range; Raw[ n,attr]=raw; F[class,attr,range]++ if (! ((attr,range) in Ar2i)) { i= ++I2ar[size()] I2ar[asAttr(i )]= attr; I2ar[asRange(i)]= range Ar2i[attr,range]= i } } function size() { return 0} function asAttr(i) { return i} function asRange(i) { return -1 * i} function nowTimesPrior(evidence,class,classes,n, \ f,nUs,pUs,out,i,j,prior,attr,range,delta,counts) { nUs = F[class,Klass,class] ; prior = nUs/n ; #print " " for(i in evidence) { j = evidence[i]; attr = I2ar[asAttr( j)]; range = I2ar[asRange(j)]; counts[attr] += (class,attr,range) in F ? F[class,attr,range] : 0; #print "step",step,"nUs",nUs,"prior",prior,"attr",attr,\ # "range",range,"f",f,"nUs",nUs,"delta",delta,"out",out } out = prior; for(attr in counts) out *= counts[attr]/nUs return out; } # XXXX. lost some experxted outouts function useful(i, attr,range) { if (i <= 0) return 0; attr = I2ar[asAttr( i)]; range = I2ar[asRange(i)]; if (attr == Klass) return 0; if ((1,attr,range) in F) return 1; return 0; } function score(evidence, lbest,lrest,out) { lbest = nowTimesPrior(evidence,1,2,N); lrest = nowTimesPrior(evidence,0,2,N); out = lbest^2 / (lbest + lrest); return out } function sortScores(scores,sorts, i,n) { delete scores; delete sorts; for(i in I2ar) if(useful(i)) getScores(scores,i) n = asorti(scores,sorts); #showScores(n,scores,sorts); return n; } function getScores(scores,i, e,s) { e[i]=i s = score(e); if (s) scores[s+rand()/10000]=i } function showScores(n,scores, sorts, i,x,tmp,attr,range) { for(x=1;x<=n;x++) { i = scores[sorts[x]]; attr=I2ar[asAttr(i)]; range=I2ar[asRange(i)]; print "attr",attr,"range",range,"sorts[x]",sprintf("%.20f",sorts[x]),\ "scores[sorts[x]]",scores[sorts[x]] } for(x in sorts ) print "sorts: " x " " sorts[x] for(x in scores) print "scores: " x " " scores[x] } function try(n,scores,sorts) { for(i=1;i<=n;i++) { try1(i,n,scores,sorts) } } # have to handle disjunctions function try1(i,n,scores,sorts, attrs,attr,range,str,evidence,j,k,some) { for(j=n;j > n-i; j--) { k=scores[sorts[j]] attr = I2ar[asAttr(k )]; range = I2ar[asRange(k)]; ranges[attr]++ if (ranges[attr] < Bins) { some = 1; evidence[k] = k; str[j] = str[j] " " attr"="range } } #for(k in evidence) print k, I2ar[asAttr(k)], I2ar[asRange(k)]; if (some) print i,score(evidence),test(evidence) ,str2label(str); } function str2label(strs, str,i,n) { n=asort(strs); str=strs[1]; for(i=2;i<=n;i++) str= str " " strs[i] return str } function test(evidence, max,i,score,n) { for(i=1;i<=Attr;i++) { max += Data[Klass,i] if(selected(i,evidence)) { score += Data[Klass,i] }} return score/max; } function selected(i,evidence, x,attr,range,e,unsatisfied) { for(e in evidence) { attr = I2ar[asAttr( e)]; unsatisfied[attr]=1 } for(e in evidence) { attr = I2ar[asAttr( e)]; range = I2ar[asRange(e)]; x = Data[i,attr] if ( (x == range) || (x ~ DontKnow) ) unsatisfied[attr]=0 } for (attr in unsatisfied) if (unsatisfied[attr]) return 0 return 1 } function main( n, scores, sorts) { n = sortScores(scores,sorts); #showScores(n,scores,sorts); try(n,scores, sorts); #showData() } function showData( i,j,sep) { for(j=1;j<=N;j++) { sep=""; for(i=1;i<=Attr;i++) { printf("%s%s",sep,Data[j,i]) sep="," } for(i=1;i<=Attr;i++) { printf("%s%s",sep,Raw[j,i]) sep="," } print ""; } } END { main() } #attr,3,range,1,sorts[x],0.03032239999999999935,scores[sorts[x]],12 #attr,1,range,2,sorts[x],0.12124100000000000155,scores[sorts[x]],6 #attr,3,range,2,sorts[x],0.12126099999999999379,scores[sorts[x]],3 #attr,4,range,3,sorts[x],0.16365199999999999192,scores[sorts[x]],8 #attr,2,range,3,sorts[x],0.16365399999999999392,scores[sorts[x]],2 #attr,3,range,3,sorts[x],0.16372100000000000541,scores[sorts[x]],7 #attr,4,range,2,sorts[x],0.20456900000000000084,scores[sorts[x]],4 #attr,2,range,2,sorts[x],0.20460400000000000809,scores[sorts[x]],11 #attr,1,range,1,sorts[x],0.29099000000000002641,scores[sorts[x]],1 #sorts: 4 0.163652 #sorts: 5 0.163654 #sorts: 6 0.163721 #sorts: 7 0.204569 #sorts: 8 0.204604 #sorts: 9 0.29099 #sorts: 1 0.0303224 #sorts: 2 0.121241 #sorts: 3 0.121261 #scores: 0.163721 7 #scores: 0.163654 2 #scores: 0.29099 1 #scores: 0.204604 11 #scores: 0.121241 6 #scores: 0.121261 3 #scores: 0.0303224 12 #scores: 0.204569 4 #scores: 0.163652 8