BEGIN { # command-line options
  # Defaults for the tunable globals. NOTE(review): presumably overridden by
  # var=value operands on the command line -- confirm against the launcher.
  Samples   = 50        # handed to worker() as `samples`
  K1        = 5         # handed to worker() as `k1`
  K2        = 15        # handed to worker() as `k2`
  Seed      = 1         # seed given to srand() in inits()
  Tests     = 0.33      # a data row goes to Test when rand() <= Tests
  Nomograms = 0.66      # non-zero => banner says "Nomograms", else "B-Squared"
  AutoStop  = 1         # not referenced in this section
  Report    = 0         # not referenced in this section
  OutFile   = "log.txt" # not referenced in this section
}
BEGIN { # internal options
  OFS        = ","
  IGNORECASE = 1        # the @keyword patterns below match case-insensitively
  Inf        = 10^32    # stand-in for infinity (initial min/max, Median, Spread)
  _          = SUBSEP   # shorthand used to build multi-part array keys by hand
  CONVFMT    = "%.8g"
}

# Entry point: run one train/test cycle with the configured option values.
function main() {
  worker(Samples,K1,K2)
}

# Print the run banner, train on the Train rows, then apply the ranked
# results to the Test rows.
#   samples, k1, k2 : forwarded unchanged to train() and test()
#   rankeds, ranked : locals; `ranked` is filled by train(), `rankeds` is
#                     train()'s return value
function worker(samples, k1,k2,    rankeds, ranked) {
  print "samples : " samples
  print "k1 : " k1
  print "k2 : " k2
  print "%test : " Tests*100 ""
  printf "Contrast Algo : "
  print Nomograms ? "Nomograms" : "B-Squared"
  print ""
  print "Training results on " Train[0] " historical examples (what looks useful):"
  rankeds = train(samples,k1,k2,ranked)
  print "Test results on " Test[0] " new projects (applying the training results to new data):\n"
  test( samples,k1,k2,rankeds,ranked)
}

###############################
## Training: Rank Attributes ##
###############################

# Build the ranking from the Train table.
#   samples, k1, k2 : forwarded to the helpers below
#   ranked          : output array, filled by rank()
# Returns whatever rank() returns (test() uses it as the upper bound of its
# run loop). Everything after `ranked` is a local.
function train(samples,k1,k2,ranked, \
               projects, neighbors, memos, best, rest,\
               knearest,rankeds) {
                                                   # inputs                              outputs
  projects(Train,samples, projects)                # example1 projects
  neighbors(samples,projects,Train[0],Train, neighbors,memos) # distances example1 to Train set
  knn(k1+k2,samples,neighbors,memos, knearest)     # knearest Train instance row numbers to example1 projects
  bestRest(knearest,k1, best,rest)                 # divide knearest into best/worst
  rankeds = rank(k1,k2,best,rest, ranked)          # contrast set between best/worst
  return rankeds
}

####################################
## Testing: Apply Best Attributes ##
####################################

# Repeatedly query the Test table, tightening one attribute range per run,
# and print the 25%/50%/75% points of the scores of the selected rows.
#   samples, k1, k2 : forwarded to projects()/neighbors()/knn()
#   rankeds         : last run number to try (value returned by train())
#   ranked          : ranked[run] holds "attribute SUBSEP value" for a run
# Everything from `i` onwards in the parameter list is a local. The function
# also reads/writes these globals: Median, Spread, Stop, run, tmp, constraint,
# cval, j, k, r, rowKlasses, baseMedian, baseSpread, prevMedian, prevSpread.
# They are deliberately left global because code outside this section may
# read them.
function test(samples,k1,k2,rankeds,ranked, \
              i,projects,neighbors,memos,knearest,\
              m,n,sorted,kloc,row,col,data) {
  Median = Inf
  Spread = Inf
  for(run=0; run<=rankeds && !Stop; run++) {
    # start every run with empty work arrays
    split("",projects,"")
    split("",neighbors,"")
    split("",memos,"")
    split("",knearest,"")
    split("",sorted,"")
    split("",kloc,"")
    # NOTE(review): runs 0 and 1 both skip this branch, so ranked[1] is never
    # applied -- confirm whether `run >= 1` was intended.
    if (run>1) {
      split(ranked[run],tmp,SUBSEP)
      constraint = tmp[1]
      cval       = tmp[2]
      # collapse the attribute's range in Test to the single value `cval`
      for(j=1;j<=Test["range",constraint,0];j++) {
        delete Test["range",constraint,j]
      }
      Test["range",constraint,0] = 1
      Test["range",constraint,1] = cval
    }
    projects(Test,samples, projects)                # different example2 projects
    neighbors(samples,projects,Test[0],Test, neighbors,memos) # distances example2 to Test set
    knn(k1+k2,samples,neighbors,memos, knearest)    # knearest Test instances row numbers to example2 projects
    # `n` and data[0] are not reset between runs: kloc is only fed to asort(),
    # which renumbers it, and `data` simply keeps accumulating rows.
    for(row=1;row<=Test[0];row++)
      if (row in knearest) {
        data[0]++
        split("",rowKlasses,"")
        r = 0   # restart the index so each row's class values sit at 1..k
        for (k in Klasses) {
          rowKlasses[++r] = Test[row,Klasses[k]]
        }
        kloc[++n]= int(score(rowKlasses))
        for(col=1;col<=Cols;col++)
          data[data[0],col]=Test[row,col] # convert row numbers to their data rows
      }
    m=asort(kloc,sorted)
    # report baseline distributions
    if (run == 0) {
      baseMedian = findMedian(sorted)
      baseSpread = find75(sorted) - find25(sorted)
    }
    prevSpread = Spread
    prevMedian = Median
    Spread = find75(sorted) - find25(sorted)
    Median = findMedian(sorted)
    # stop once a run improves neither the median nor the spread
    if (Median >= prevMedian && Spread >= prevSpread && run > 1)
      Stop = 1
    # print "Baseline (estimates without any project changes): "
    # print "\t\t\t\t25%\t50%\t75%"
    # if (!Stop) {
    printf "Query "run":\t\t"find25(sorted) "\t" findMedian(sorted) "\t" find75(sorted)"\t\t20*{"
    for(i=1;i<=m;i++)
      printf("%s ", sorted[i])
    print "}"
    # }
  }
  # Summary line: relative reduction of median and spread versus the baseline.
  # ProjName is not assigned anywhere in this section.
  # NOTE(review): gawk aborts with a division-by-zero error when baseMedian or
  # baseSpread is 0 -- confirm that cannot happen for real inputs.
  print "@@@"Relation"."ProjName","(baseMedian - prevMedian)/baseMedian","(baseSpread-prevSpread)/baseSpread
  # print "MedianReduction: " ((baseMedian - prevMedian) / baseMedian) * 100
  # saya(Test,"test")
  # split("",Previous,"")
  # split("",Baseline,"")
  # Previous[0] = m
  # for (k=1; k<=Previous[0]; k++) {
  #   Previous[k] = sorted[k]
  #   Baseline[k] = sorted[k]
  # }
  # print "\nResults of applying the top n-th ranges found during training\n"
  # selects(k1,data,rankeds,ranked) # try the tricks found during training on the knearest Test instances
}

#######################################
## Parse Project and Historical Data ##
#######################################

# Strip `%` comments from every input line.
{ gsub(/%.*/,"") }

# Skip lines that are empty or hold only blanks/tabs (this includes lines
# emptied by the comment strip above) so they are never stored as data rows.
/^[ \t]*$/    { next }

# An @project line ends the data section.
/^@project/   { In = 0 }

# Inside the data section: send each row at random to Test or Train.
In            {
  if (rand() <= Tests)
    cells(Test,Cols)
  else
    cells(Train,Cols)
}

/^@relation/  { Relation=$2 }
/^@attribute/ { def($2) }
/^@class/     { defclass($2) }
/^@data/      { In = 1; inits(Cols) }   # rows start on the following line
/^@/          { next }

# Seed the random generator and initialise the per-column min/max trackers
# of both tables so the first real value replaces them.
#   cols : number of columns to initialise;  i : local
function inits(cols,   i) {
  # Klass = Klass < 0 ? cols + Klass + 1 : Klass
  srand(Seed ? Seed : 1)
  for(i=1;i<=cols;i++) { Train["max",i]= -1*Inf; Train["min",i]=Inf }
  for(i=1;i<=cols;i++) { Test[ "max",i]= -1*Inf; Test[ "min",i]=Inf }
}

# Register (or re-register) the attribute named on the current line and
# store fields 3..NF as its range in both Train and Test.
#   name : attribute name; a `?` in it marks the column as a goal and is
#          removed before the name is looked up
#   a, i, goalp : locals
function def(name,   a,i,goalp) {
  goalp = sub(/\?/,"",name)   # escaped: a bare leading `?` is not a portable regex
  if (name in Name) {
    a = Name[name]
  } else {
    a = Name[name] = ++Cols
    Eman[Cols]=name           # reverse map: column number -> name
  }
  if (Train["range",a,0])
    clearStack(Train, "range" _ a)
  clearStack(Test, "range" _ a)
  for(i=3;i<=NF;i++) {
    Train["range",a, ++Train["range",a,0]] = $i
    Test[ "range",a, ++Test[ "range",a,0]] = $i
  }
  if (goalp) Goal[a]=1
}

# Same bookkeeping as def(), without the goal marker handling, and the
# column is additionally recorded in Klasses.
#   name : class column name;  a, i : locals (declared so they do not leak
#          into the global namespace)
function defclass(name,   a,i) {
  if (name in Name) {
    a = Name[name]
  } else {
    a = Name[name] = ++Cols
    Eman[Cols]=name
  }
  if (Train["range",a,0])
    clearStack(Train, "range" _ a)
  clearStack(Test, "range" _ a)
  for(i=3;i<=NF;i++) {
    Train["range",a, ++Train["range",a,0]] = $i
    Test[ "range",a, ++Test[ "range",a,0]] = $i
  }
  Klasses[name] = Name[name]
}

# Empty the counted list stored in array `a` under prefix `key`:
# a[key,0] holds the element count, a[key,1..count] the elements.
#   i, top : locals (`top` avoids sharing a name with the max() function)
function clearStack(a, key,   i, top) {
  if (top = a[ key _ 0 ])
    for(i=1;i<=top;i++)
      delete a[ key _ i ]
  a[key _ 0] = 0
}

# Append the current input record as a new row of `data` and update the
# running per-column max/min.
#   data : Train or Test;  cols : number of fields to copy;  col : local
function cells(data,cols,   col) {
  data[0]++
  for(col=1;col<=cols;col++) {
    data[data[0],col] = $col
    data["max",col] = max(data["max",col],$col)
    data["min",col] = min(data["min",col],$col)
  }
}