# no more From To. need to pull at random BEGIN { # command-line options Samples = 20 K1 = 5 K2 = 15 Seed = 1 Klass = -1 Tests = 0.33 } BEGIN { # internal options OFS="," IGNORECASE=1 Inf = 10^32 _ = SUBSEP CONVFMT="%.8g" } ################################################################## # main program function main() { worker(Samples,K1,K2) } function worker(samples, k1,k2, rankeds, ranked) { print "samples : " samples print "k1 : " k1 print "k2 : " k2 print "%test : " Tests*100 "\n" print "Training results on " Train[0] " historical examples (what looks useful):" rankeds = train(samples,k1,k2,ranked) saya(ranked,"ranked") print "Test results on " Test[0] " new projects (applying the training results to new data):\n" test( samples,k1,k2,rankeds,ranked) } function train(samples,k1,k2,ranked, \ projects, neighbors, memos, best, rest,\ knearest,rankeds) { # inputs outputs # ------ ------- projects(Train,samples, projects) # example1 projects neighbors(samples,projects,Train[0],Train, neighbors,memos) # distances example1 to Train set knn(k1+k2,samples,neighbors,memos, knearest) # knearest Train instance row numbers to example1 projects bestRest(knearest,k1, best,rest) # divide knearest into best/worst rankeds = rank(k1,k2,best,rest, ranked) # contrast set between best/worst return rankeds } function test(samples,k1,k2,rankeds,ranked, \ i,projects,neighbors,memos,knearest,\ m,n,sorted,kloc,row,col,data) { projects(Test,samples, projects) # different example2 projects neighbors(samples,projects,Test[0],Test, neighbors,memos) # distances example2 to Test set knn(k1+k2,samples,neighbors,memos, knearest) # knearest Test instances row numbers to example2 projects for(row=1;row<=Test[0];row++) if (row in knearest) { data[0]++ kloc[++n]= int(Test[row,Klass]) for(col=1;col<=Cols;col++) data[data[0],col]=Test[row,col] # convert row numbers to their data rows } m=asort(kloc,sorted) # report baseline distributions print "Baseline (estimates without any project changes): " for(i=1;i<=m;i++) printf("%s ", sorted[i]) print "\n\nResults of applying the top n-th ranges found during training\n" selects(k1,data,rankeds,ranked) # try the tricks found during training on the knearest Test instances } ################################################################## # read in data { gsub(/%.*/,"") } /^[ \t]$/ { next } /^@project/ { In = 0 } In { rand() <= Tests ? cells(Test,Cols) : cells(Train,Cols) } /^@relation/ { Relation=$2 } /^@attribute/ { def($2) } /^@data/ { In = 1; inits(Cols) } /^@/ { next } function inits(cols, i) { Klass = Klass < 0 ? cols + Klass + 1 : Klass srand(Seed ? Seed : 1) for(i=1;i<=cols;i++) { Train["max",i]= -1*Inf; Train["min",i]=Inf } for(i=1;i<=cols;i++) { Test[ "max",i]= -1*Inf; Test[ "min",i]=Inf } } function def(name, a,i,goalp) { goalp = sub(/?/,"",name) if (name in Name) { a = Name[name] } else { a = Name[name] = ++Cols Eman[Cols]=name } if (Train["range",a,0]) clearStack(Train, "range" _ a) clearStack(Test, "range" _ a) for(i=3;i<=NF;i++) { Train["range",a, ++Train["range",a,0]] = $i Test[ "range",a, ++Test[ "range",a,0]] = $i } if (goalp) Goal[a]=1 } function clearStack(a, key, i, max) { if (max = a[ key _ 0 ]) for(i=1;i<=max;i++) delete a[ key _ i ] a[key _ 0] = 0 } function cells(data,cols, col) { data[0]++ for(col=1;col<=cols;col++) { data[data[0],col] = $col data["max",col] = max(data["max",col],$col) data["min",col] = min(data["min",col],$col) } } ################################################################## # generate projects function projects(data,news,new, row,col,v) { new[0]=news for(col=1;col<=Cols;col++) { new["max",col]= -1*Inf new["min",col]= Inf for(row=1;row<=news;row++) { v = projectValue(data,col) new[row,col] = v new["max",col] = max(new["max",col],v) new["min",col] = min(new["min",col],v) } } } function projectValue(data,a, max,one) { max = data["range",a,0] one = int(rand() * max) + 1 return data["range",a,one] } ################################################################## # find the k-th nearest historial projects near the generated projects function euclidean(row1,row2,data1,data2, n,col,d,d1,d2) { for(col=1;col<=Cols;col++) if (col != Klass) { d1 = normalize(data1,col,data1[row1,col]) d2 = normalize(data2,col,data2[row2,col]) d += abs(d1 - d2)^2 n++ } return sqrt(d)/sqrt(n) } function distance(row1,row2,data1,data2,memo, d) { d = as100(euclidean(row1,row2,data1,data2)) memo[-1 * d] = row1 # d started at row1 memo[d] = row2 # d ended at row2 return d } function normalize(data,col,n, min,max,d) { min = data["min",col] max = data["max",col] d = min == max ? 1 : (n - min) /(max - min) #print "n " n " min " min " max " max " d " d return d } function neighbors(news,new,olds,old,neighbor,memo, o,n) { for(n=1;n<=news;n++) for(o=1;o<=olds;o++) push2(distance(n,o,new,old,memo),neighbor,n) } function knn(k,news,neighbor,memo,ks, dist,n,most,i,d,sorted) { for(n=1;n<=news;n++) { most = neighbor[n,0] for(i = 1;i <= most;i++) dist[++d] = neighbor[n,i] } knnDebug(dist,memo) asort(dist,sorted) for(i=1;i<=d;i++) { n = memo[sorted[i]] if ( ++ks[n]==1 ) k-- if (k == 0) return i } return k } function knnDebug(dist,memo, com,i) { if (KnnDebug) { com = "sort -n -k 2 | " KnnDebug for(i in dist) print memo[dist[i]] " " dist[i] | com close(com) } } ################################################################## # divide the k-th nearest historial projects into best (lowest) # estiamted and the rest. collect frequency counts for best and rest. # rank attribute ranges by how common they are in best and how # rare they are in rest function bestRest(rows,border,best,rest, enough,n,scores,row) { for(row in rows) scores[++n]= Train[row,Klass] asort(scores) enough = scores[border] for(row in rows) if ( Train[row,Klass] <= enough ) count(row,best,rows[row]) else count(row,rest,rows[row]) } function count(row,f,n, col) { f[0]++ for(col in Goal) f[col,Train[row,col]] += n } function rank(k1,k2,best,rest,ranked, \ range,bests,rests,i,b,r,score,scores,sorted,memo,max) { bests = best[0] rests = rest[0] for(i in best) if (i != 0) { b = best[i] / bests r = rest[i] / rests if(Nomograms) { if(best[i] < (bests*Nomograms)) continue score = log(b/(r+0.000001)) + rand()/10000# as100(b^2/(b+r)) } else { score=as100(b^2(b+r)) } scores[i] = score memo[score] = i if (RankDebug) print no_(i) "\t" " b=" b " r= " r " score=" score } max = asort(scores,sorted) if (Nomograms) showRanks(memo,sorted,max) for(i=max; i>=1;i--) # highest score must be forst ranked[max-i+1] = memo[sorted[i]] return max } function showRanks (memo,sorted,max, i, range,tmp,com) { com="sort -r -n | cat -n" print "#n\tscore\trange" print "------\t-----\t--------------" for(i=max; i>=1;i--) { # highest score must be forst range = memo[sorted[i]] split(range,tmp,_) print sprintf("%5.2f",sorted[i]) "\t" Eman[tmp[1]] " = " tmp[2] | com } close(com) print "" } ######################################################################### # select and report subset of relevant rows that satisfy constraints 1..n function selects(k1,data,rankeds,ranked, constraints,n) { for(n=1;n<=rankeds;n++) addNextConstraint(k1,n,data,ranked[n],constraints) } function addNextConstraint(k1,n,data,constraint,constraints, selected) { extendConstraint(constraint, constraints) selectRows(data,constraints,selected) report(k1,n,data,constraint, constraints, selected) } function extendConstraint(constraint,constraints, attrange,attr,range) { split(constraint, attrange,_); attr = attrange[1] range = attrange[2] if (attr in constraints) constraints[attr] = "(" range "|" substr(constraints[attr],2) else { constraints[attr] = "(" range ")" } if (SelectsDebug) saya(constraints,"constrants") } function selectRows(data,constraints,selected, row) { for(row=1;row<=data[0];row++) if ( selectRow(data,row,constraints) ) selected[row]=1 } function selectRow(data,row,constraints, col) { for(col in constraints) if ( constraints[col] !~ data[row,col] ) return 0 return 1 } function report(k1,n,data,constraint, constraints, selected, \ all,attr,range,attrange,row,tmp,i,str,sep,j,max,sorted) { split(constraint, attrange,_); attr = Eman[attrange[1]] range = attrange[2] for(row in selected) { all++ tmp[++i] = int(data[row,Klass]) } max=asort(tmp,sorted) for(j=1;j<=max;j++) { str = str sep sorted[j] sep=" " } printf(n==1 ? " " : "and ") printf("n="n ": " attr "= " constraints[attrange[1]] " :\t\t " \ sorted[round(i*0.25)] "\t" sorted[round(i*0.5)] "\t" sorted[round(i*0.75)] "\t" \ max ) print (all <= 30) ? "*{" str "}" : "*{..}" } ################################################################## # standard utils function barph(str) { print str >"/dev/stderr"; fflush("/dev/stderr") } function push2(v,a,i) { a[i,++a[i,0]] = v; return v } function push(v,a) { a[++a[0]] = v; return v } function as100(n) { return (n*100) + rand()/10 } function abs(n) { return n < 0 ? -1* n : n } function max(n1,n2) { return n1