#!/sw/bin/gawk -f
# Log-odds-ratio (LOR) report for comma-separated records.
#
# Input:
#   - Any line matching /#/ is a header: each of its fields is appended to
#     Name[] (Name[0] holds the count of names collected so far).
#   - Every other line is a data record. A record is "goal" (class 1) when
#     its Klass-th field equals Goal, otherwise class 0.
#
# Output (written by lor() at END): one row per attribute/value pair seen,
# ordered from the largest LOR to the smallest.
#
# Globals filled while reading:
#   Instances          number of data records
#   H[class]           number of records per class (class is 0 or 1)
#   N[class,name,val]  number of records of that class with name=val
#   Seen[name,val]     set of attribute/value pairs encountered

BEGIN {
  FS = ","              # fields are comma separated
  Missing = "?"         # cells holding this value are not counted
  Goal = "yes"          # class value treated as class 1
  Klass = -1            # class column; negative counts back from the last named column
  E = 2.718281828
  Inf = 10^30           # 1/Inf is the tiny constant used to avoid dividing by zero
  IGNORECASE = 1
  OFMT = "%.10f"
  _ = SUBSEP            # shorthand used when building/splitting compound keys
  OFS = ","
}

# Header line: collect column names, then resolve a negative Klass
# against the number of names collected (e.g. -1 becomes the last column).
/#/ {
  for (I = 1; I <= NF; I++)
    Name[++Name[0]] = $I
  if (Klass < 0)
    Klass = Name[0] + 1 + Klass
  next
}

# Data line: the comparison yields 1 (goal) or 0 (not goal).
{ train($Klass == Goal) }

END { lor() }

# train(class): update the frequency tables from the current record.
#   class  1 if the record belongs to the goal class, else 0
# The class column and cells equal to Missing are skipped.
function train(class,    i) {
  Instances++
  H[class]++
  for (i = 1; i <= NF; i++) {
    if (i == Klass) continue
    if ($i == Missing) continue
    N[class, Name[i], $i]++
    Seen[Name[i], $i]          # a bare reference is enough to create the entry
  }
}

# lor(): compute the log odds ratio of every seen attribute/value pair and
# print the report.
#
# Columns printed per row:
#   x=y      attribute and value (compound key with SUBSEP shown as a comma)
#   LOR      the log odds ratio
#   points   LOR rescaled so that the largest LOR is worth 100
#   p(good)  points2p() of this LOR alone, with prior H[1]/Instances
#   use?     1 for the first row of an attribute; "*" for a later row of an
#            already-seen attribute whose value+1 or value-1 was accepted
#            earlier; "X" when the row is not used
#   p(used)  points2p() of the accumulated LOR of the used rows
# A candidate row is only accepted when it raises p(used) by more than 0.01
# over the previous row's value.
function lor(    i, a, n, tiny, top, fudge, prior, use, tmp, allkeys, used, reported, pnew, plast, lors) {
  tiny = 1 / Inf
  for (i in Seen) {
    # The counts get the same tiny offset as the class totals: a value that
    # occurs in only one class would otherwise make the ratio divide by zero
    # (fatal in gawk). The offset vanishes in floating point for counts >= 1.
    Lor[i] = log(((N[1 _ i] + tiny) / (H[1] + tiny)) / ((N[0 _ i] + tiny) / (H[0] + tiny))) + some()
  }
  n = sorted(Lor, a)

  printf("%-40s %4s %5s %5s %s %s\n", "x=y", "LOR", "points", "p(good)", "use?", "p(used)")
  printf("--------------------------------------- ----- ------ ----- --- ------\n")
  if (n == 0)
    return                     # nothing was seen: header only

  top = a[-1 * n]              # largest LOR (sorted() stores values ascending)
  fudge = (top != 0) ? 100 / top : 0
  prior = H[1] / Instances

  for (i = n; i >= 1; i--) {
    use = 0
    tmp = a[-1 * i]
    keys(a[i], allkeys)        # allkeys[1] = attribute name, allkeys[2] = value
    if (++used[allkeys[1]] == 1) {
      use = 1
    } else if (reported[allkeys[1], allkeys[2] + 1] || reported[allkeys[1], allkeys[2] - 1]) {
      use = "*"
    }
    if (use) {
      pnew = points2p(lors + tmp, prior)
      if ((pnew - plast) > 0.01) {
        lors += tmp
        reported[allkeys[1], allkeys[2]] = 1
        best = pnew            # left global, as in the original script
      } else {
        use = 0
      }
    }
    plast = pnew
    printf("| %-40s %.5s %5s %.4f %s %.4s\n",
           commas(a[i]), tmp, int(tmp * fudge), points2p(tmp, prior),
           (use ? use : "X"), pnew)
  }
}

# sorted(a0, a): sort the values of a0 in ascending order.
#   a0  array mapping key -> number (left unmodified)
#   a   output: a[0] = n, a[-i] = i-th smallest value, a[i] = its key in a0
# Returns n, the number of entries.
#
# Values index the reverse lookup, so they are converted to strings. CONVFMT
# is raised for the duration of the call so that distinct numbers never share
# a string, and keys with exactly equal values are queued, not overwritten.
function sorted(a0, a,    k, i, n, s, vals, count, taken, keyof, saved) {
  saved = CONVFMT
  CONVFMT = "%.17g"
  for (k in a0) {
    s = a0[k] ""
    keyof[s, ++count[s]] = k
  }
  n = asort(a0, vals)
  a[0] = n
  for (i = 1; i <= n; i++) {
    s = vals[i] ""
    a[-1 * i] = vals[i]
    a[i] = keyof[s, ++taken[s]]
  }
  CONVFMT = saved
  return n
}

# saya(pre, a): debugging aid; print every entry of array a as
# "pre[key] = value" through a "sort" pipe. The random suffix makes each
# command string distinct; it follows a "#", which a POSIX shell treats as
# the start of a comment.
function saya(pre, a,    i, com) {
  com = "sort #" rand()
  for (i in a)
    print pre "[" commas(i) "] = " a[i] | com
  close(com)
}

# points2p(f, p): logistic transform of (log odds of p) + f.
#   f  amount added to the log odds
#   p  prior probability
# Both sides of the odds get the 1/Inf offset so that p == 0 or p == 1 cannot
# divide by zero; for any p that is a ratio of ordinary counts strictly
# between 0 and 1 the offset vanishes in floating point.
function points2p(f, p,    odds) {
  odds = (p + 1 / Inf) / ((1 - p) + 1 / Inf)
  return 1 / (1 + E^(-1 * log(odds) - f))
}

# commas(s): return s with every SUBSEP replaced by a comma.
function commas(s) {
  gsub(_, ",", s)
  return s
}

# key1(s): first component of compound key s.
function key1(s,    a) {
  split(s, a, _)
  return a[1]
}

# key2(s): second component of compound key s.
function key2(s,    a) {
  split(s, a, _)
  return a[2]
}

# keys(s, a): split compound key s into array a (a[1], a[2], ...).
function keys(s, a) {
  split(s, a, _)
}

# some(): a tiny positive random number, 1/(rand()*Inf).
# rand() may return 0, which would divide by zero, so zero draws are retried.
function some(    r) {
  do {
    r = rand()
  } while (r == 0)
  return 1 / (r * Inf)
}