dnl
# Attribute-value ranker and row selector for ARFF-style input.
#
# This file is m4 source that expands to a gawk program: it relies on the m4
# macros defined below and on gawk extensions (IGNORECASE, asort). Expand it
# with m4 first, then run the result with gawk.
#
# Processing outline:
#   1. Header lines (anything containing "@") are collected; every
#      "@attribute" line registers a column, and the last one is taken to be
#      the class attribute whose listed values are given scores.
#   2. Data rows (after "@data", comma separated) are stored cell by cell.
#   3. At END every (column, symbol) pair is credited with the scores of the
#      rows it appears in; the pairs are ranked, the top ones are reported
#      as "% attribute == symbol" lines, and the header plus every row that
#      satisfies the reported pairs is printed.
#
# Maintenance note: m4 sees this text before gawk does. Text after a hash is
# copied through untouched, but code must avoid m4 builtin names and the m4
# quote characters.

BEGIN {
  Klass  = -1      # class column; a negative value counts back from the last attribute
  Ignore = "[?]"   # dynamic regexp: cells containing "?" are not counted as symbols
  Factor = 2       # base of the class scores: Factor^0, Factor^1, ...
  Dull   = 0.5     # fraction of the total share mass that is written off as dull
  Enough = 1       # fraction of constrained columns a row must satisfy to be printed
}
BEGIN {
  srand()
  # Kept as a string and used as a dynamic regexp. A /.../ constant on the
  # right of an assignment would store the result of matching $0 instead.
  DefNumber  = "continuou|real|integer"
  IGNORECASE = 1
  OFS        = ";"
  CONVFMT    = "%.10f"
}

# Array layout helpers (expanded by m4):
#   x[_size]     element count, stored at subscript 0
#   x[_key(i)]   i-th key, stored at subscript i
#   x[_value(i)] i-th value, stored at subscript -i
#   x[_nump(i)]  i-th numeric flag, stored at subscript -i
define(_key,$1)
define(_value,-1*$1)
define(_nump,-1*$1)
define(_size,0)

# Strip comments, skip blank lines, collect the header, then store data rows.
{ sub(/[\#\%].*$/,"") }
/^$/         { next }
/@/          { Header = Header "\n" $0 }
/@attribute/ { Last = $0; def(Attr, $2, $3) }
/@data/      { FS = ","; MinScore = scores(Last, Scores) }
/@/          { next }
             { gsub(/[\t ]/, "") }
             { keep(Data, Uniques, Freq) }
END          { main(1, Data, Attr, Freq, Uniques, klass(Klass, Attr), Scores, MinScore * 0.99999) }

# Resolve the class column: a negative k counts back from the end of the
# attribute list a (so -1 is the last attribute); other values pass through.
function klass(k,a) {
  return k < 0 ? k + a[_size] + 1 : k
}

# Store the current record as a new row of d. For every cell that does not
# match Ignore, count its occurrences in f[column,symbol] and, on first
# sight, append the symbol to the per-column list u[column,1..u[column,0]].
function keep(d,u,f,   max,i) {
  max = ++d[_size]
  for (i = 1; i <= NF; i++) {
    d[i,max] = $i
    if ($i !~ Ignore)
      if (++f[i,$i] == 1)
        u[i, ++u[i,_size]] = $i
  }
}

# Register attribute `name` in attr and flag whether its declaration
# `about` matches DefNumber (that is, whether it is declared numeric).
function def(attr,name,about,   n) {
  n = ++attr[_size]
  attr[n] = name
  attr[_nump(n)] = about ~ DefNumber
}

# Parse an "@attribute name {v1,v2,...}" line and fill out[v] with
# Factor^0, Factor^1, ... in the order the values are listed.
# Returns Factor.
# NOTE(review): the caller stores the result as MinScore, although the
# smallest score assigned here is Factor^0; the value is not used by any
# code visible in this file, so it is left unchanged.
function scores(str,out,   n,tmp,i) {
  gsub(/[{},]/, " ", str)
  sub(/[ \t]*$/, "", str)
  n = split(str, tmp, /[\t ]+/)
  for (i = 3; i <= n; i++)      # fields 1 and 2 are "@attribute" and the name
    out[tmp[i]] = Factor^(i-3)
  return Factor
}

# Driver: credit every (column, symbol) pair, rank the pairs, report the
# selected ones and print the rows that satisfy them.
#   r       run label, used only as the first subscript of dull
#   d       data cells        a  attribute names
#   f       symbol counts     u  distinct symbols per column
#   k       class column      scores  class value -> score
#   mins    passed through to okdull, which does not use it
function main(r,d,a,f,u,k,scores,mins,   ok,dull,value) {
  values(d, a, f, u, k, scores, value)
  okdull(r, u, k, mins, value, ok, dull)
  printok(a, ok)
  select(d, ok)
}

# For every row j add the score of its class value to value[i,symbol] for
# each non-class column i. Returns the total of all credits handed out.
# NOTE(review): the column count is NF as left by the last record read,
# so a trailing blank or comment line (NF == 0) would hide every column
# -- confirm inputs never end that way.
function values(d,a,f,u,k,scores,value,   imax,jmax,j,score,i,total) {
  jmax = d[_size]
  imax = NF
  for (j = 1; j <= jmax; j++) {
    score = scores[d[k,j]]
    for (i = 1; i <= imax; i++)
      if (i != k) {
        total += score
        value[i, d[i,j]] += score
      }
  }
  return total
}

# Rank every (column, symbol) pair of the non-class columns and split them
# into selected pairs (appended to ok) and dull pairs (dull[r,col,symbol]=1).
#
# Steps: min-max normalise the raw credits over all pairs; divide each
# normalised credit by the total of its column to get a share; sort the
# shares ascending and walk them while accumulating; a pair is selected once
# the running sum has reached Dull times the sum of all shares.
#
# Differences from a naive value-keyed lookup, all deliberate:
#   * pairs are found through per-score buckets, so tied scores no longer
#     overwrite each other;
#   * a zero range or a zero column total yields a share of 0 instead of a
#     fatal division by zero;
#   * the lowest-ranked pair is classified like every other pair.
function okdull(r,u,k,mins,value,ok,dull,    \
                lo,hi,span,count,col,symbols,i,m,v,raw,norm,cols,syms, \
                coltotal,ranked,total,enough,n,bucket,taken,running,slot) {
  lo = 10^38
  hi = -1 * lo
  for (col = 1; col <= NF; col++) {
    if (col == k) continue
    symbols = u[col,_size]
    for (i = 1; i <= symbols; i++) {
      v = value[col, u[col,i]]
      if (v < lo) lo = v
      if (v > hi) hi = v
      count++
      raw[count]  = v
      cols[count] = col
      syms[count] = u[col,i]
    }
  }
  if (!count) return            # nothing to rank

  span = hi - lo
  for (m = 1; m <= count; m++) {
    norm[m] = span ? ((raw[m] - lo) / span) : 0
    coltotal[cols[m]] += norm[m]
  }
  for (m = 1; m <= count; m++) {
    v = coltotal[cols[m]]
    ranked[m] = v ? (norm[m] / v) : 0
    total += ranked[m]
    # Remember which pairs carry this score, in insertion order.
    slot = ++bucket[ranked[m], _size]
    bucket[ranked[m], slot] = m
  }

  enough = total * Dull
  n = asort(ranked)             # ascending; pair identity is kept in bucket
  for (i = 1; i <= n; i++) {
    v = ranked[i]
    m = bucket[v, ++taken[v]]   # next unused pair with this score
    running += v
    if (running >= enough) {
      slot = ++ok[_size]
      ok[_key(slot)]   = cols[m]
      ok[_value(slot)] = syms[m]
    } else
      dull[r, cols[m], syms[m]] = 1
  }
}

# Print the collected header, then every stored row accepted by select1.
function select(d,ok,   j,max) {
  max = d[_size]
  print Header
  for (j = 1; j <= max; j++)
    if (select1(d, j, ok))
      printit(d, j)
}

# Report every selected pair as a "% attribute == symbol" comment line.
function printok(a,ok,   i,max) {
  max = ok[_size]
  for (i = 1; i <= max; i++)
    print "% " a[ok[_key(i)]] " == " ok[_value(i)]
}

# Print row j of d as one comma separated line.
# NOTE(review): the width is NF as left by the last record read.
function printit(d,j,   i) {
  printf("%s", d[1,j])
  for (i = 2; i <= NF; i++)
    printf(",%s", d[i,j])
  printf("\n")
}

# Decide whether row j satisfies the selected pairs. A constrained column
# counts as satisfied when the row holds any one of the symbols selected for
# it; the row passes when the satisfied fraction of constrained columns is
# at least Enough. With no constraints at all every row passes.
function select1(d,j,ok,   i,m,n,goals,col,want,got,max) {
  max = ok[_size]
  for (i = 1; i <= max; i++) {
    col  = ok[_key(i)]
    want = ok[_value(i)]
    got  = d[col,j]
    goals[col] += (got == want)
  }
  for (i in goals) {
    n++
    m += (goals[i] ? 1 : 0)
  }
  if (!n) return 1              # avoid 0/0 when nothing was selected
  return m/n >= Enough
}