BEGIN { # command-line options (defaults)
    Samples  = 20
    K1       = 5
    K2       = 15
    Seed     = 1
    Tests    = 0.33     # a data row goes to Test when rand() <= Tests
    AutoStop = 1
    Report   = 0
    OutFile  = "log.txt"
}
BEGIN { # internal options
    OFS        = ","
    IGNORECASE = 1      # gawk: regex matching (e.g. the @keyword patterns) ignores case
    Inf        = 10^32
    _          = SUBSEP # lets  a[x _ y]  address the same cell as  a[x,y]
    CONVFMT    = "%.8g"
}

##################################################################
# main program

# Entry point: run the worker with the configured option values.
function main() {
    worker(Samples,K1,K2)
}

# Print the run settings, then run the training phase followed by the
# test phase.
#   samples, k1, k2 : passed through unchanged to train() and test()
#   rankeds, ranked : locals; filled by train(), consumed by test()
function worker(samples, k1,k2,    rankeds, ranked) {
    print "samples : " samples
    print "k1 : " k1
    print "k2 : " k2
    print "%test : " Tests*100
    print "Contrast Method : " (Nomograms ? "Nomograms" : "BSquared")
    print ""
    print "Training results on " Train[0] " historical examples (what looks useful):"
    rankeds = train(samples,k1,k2,ranked)
    #saya(ranked,"ranked")
    print "Test results on " Test[0] " new projects (applying the training results to new data):\n"
    test( samples,k1,k2,rankeds,ranked)
}

# Training phase over the Train table.
#   ranked  : output array, filled by rank()
#   returns : whatever rank() returns
# The locals are named projs/nbrs (not projects/neighbors) because POSIX
# awk does not allow a parameter to share its name with a function.
function train(samples,k1,k2,ranked, \
               projs, nbrs, memos, best, rest, \
               knearest,rankeds) {
    #                                                  inputs  outputs
    #                                                  ------  -------
    projects(Train,samples, projs)                   # example1 projects
    # saya(projs,"P")
    neighbors(samples,projs,Train[0],Train, nbrs,memos) # distances example1 to Train set
    knn(k1+k2,samples,nbrs,memos, knearest)          # knearest Train instance row numbers to example1 projects
    bestRest(knearest,k1, best,rest)                 # divide knearest into best/worst
    rankeds = rank(k1,k2,best,rest, ranked)          # contrast set between best/worst
    return rankeds
}

# Test phase over the Test table: collect the knearest Test rows, print
# the baseline score distribution, store it in the globals Previous and
# Baseline, then hand the collected rows to selects().
#   rankeds, ranked : results produced by train()
# Everything after `ranked` is a local.
function test(samples,k1,k2,rankeds,ranked, \
              i,projs,nbrs,memos,knearest, \
              m,n,sorted,kloc,row,col,data, \
              k,r) {
    projects(Test,samples, projs)                    # different example2 projects
    neighbors(samples,projs,Test[0],Test, nbrs,memos) # distances example2 to Test set
    knn(k1+k2,samples,nbrs,memos, knearest)          # knearest Test instances row numbers to example2 projects
    for(row=1;row<=Test[0];row++)
        if (row in knearest) {
            data[0]++
            split("",rowKlasses,"")  # empty the array (rowKlasses stays global)
            r = 0                    # restart so every row fills rowKlasses from index 1
            for (k in Klasses) {
                rowKlasses[++r] = Test[row,Klasses[k]]
            }
            kloc[++n]= int(score(rowKlasses))
            for(col=1;col<=Cols;col++)
                data[data[0],col]=Test[row,col]      # convert row numbers to their data rows
        }
    m=asort(kloc,sorted)
    # report baseline distributions
    # print "Baseline (estimates without any project changes): "
    # NOTE(review): this loop is assumed to be live code rather than part of
    # the commented-out line above -- confirm against the upstream file.
    for(i=1;i<=m;i++)
        printf("%s ", sorted[i])
    print "\n\t\t\t\t25%\t50%\t75%"
    print "\t Baseline:\t\t"find25(sorted) "\t" findMedian(sorted) "\t" find75(sorted)
    split("",Previous,"")
    split("",Baseline,"")
    Previous[0] = m
    for (k=1; k<=Previous[0]; k++) {
        Previous[k] = sorted[k]
        Baseline[k] = sorted[k]
    }
    print "\nResults of applying the top n-th ranges found during training\n"
    selects(k1,data,rankeds,ranked)                  # try the tricks found during training on the knearest Test instances
}

##################################################################
# read in data

              { gsub(/%.*/,"") }       # strip "%" comments
/^[ \t]*$/    { next }                 # skip empty and all-blank lines (also what is left of comment-only lines)
/^@project/   { In = 0 }
In            { # each data row is sent to Test or Train at random
                if (rand() <= Tests)
                    cells(Test,Cols)
                else
                    cells(Train,Cols)
              }
/^@relation/  { Relation=$2 }
/^@attribute/ { def($2) }
/^@class/     { defclass($2) }
/^@data/      { In = 1; inits(Cols) }  # placed after the In rule, so the @data line itself is never read as a row
/^@/          { next }

# Seed the random number generator and reset the per-column max/min
# trackers of both tables before any data row is read.
function inits(cols,    i) {
    # Klass = Klass < 0 ? cols + Klass + 1 : Klass
    srand(Seed ? Seed : 1)
    for(i=1;i<=cols;i++) {
        Train["max",i]= -1*Inf;  Train["min",i]=Inf
        Test[ "max",i]= -1*Inf;  Test[ "min",i]=Inf
    }
}

# Handle an @attribute line: register column `name` (the first "?" in the
# name is removed and marks the column as a goal) and store fields 3..NF
# of the current record as that column's range in both Train and Test.
function def(name,    a,i,goalp) {
    goalp = sub(/\?/,"",name)          # 1 if a "?" was removed from the name
    if (name in Name) {
        a = Name[name]
    } else {
        a = Name[name] = ++Cols
        Eman[Cols]=name                # reverse map: column number -> name
    }
    # redefinition: drop previously stored ranges (no-op when there are none)
    clearStack(Train, "range" _ a)
    clearStack(Test,  "range" _ a)
    for(i=3;i<=NF;i++) {
        Train["range",a, ++Train["range",a,0]] = $i
        Test[ "range",a, ++Test[ "range",a,0]] = $i
    }
    if (goalp) Goal[a]=1
}

# Handle a @class line: same registration and range storage as def(),
# but the name is used as given and the column is recorded in Klasses.
function defclass(name,    a,i) {
    if (name in Name) {
        a = Name[name]
    } else {
        a = Name[name] = ++Cols
        Eman[Cols]=name
    }
    clearStack(Train, "range" _ a)
    clearStack(Test,  "range" _ a)
    for(i=3;i<=NF;i++) {
        Train["range",a, ++Train["range",a,0]] = $i
        Test[ "range",a, ++Test[ "range",a,0]] = $i
    }
    Klasses[name] = Name[name]
}

# Empty the stack stored in a[key,1..n], whose size n is kept in
# a[key,0]; the size is left at 0.
function clearStack(a, key,    i, top) {
    top = a[ key _ 0 ]
    if (top)
        for(i=1;i<=top;i++)
            delete a[ key _ i ]
    a[key _ 0] = 0
}

# Append the current input record to table `data` as a new row (row count
# in data[0]) and update the running per-column max and min.
function cells(data,cols,    col) {
    data[0]++
    for(col=1;col<=cols;col++) {
        data[data[0],col] = $col
        data["max",col] = max(data["max",col],$col)
        data["min",col] = min(data["min",col],$col)
    }
}