#########################################################################
# select and report subset of relevant rows that satisfy constraints 1..n
#
# Conventions visible in this file:
#   - a data table is an array indexed [row SUBSEP col]; element [0] holds
#     the number of rows
#   - Cols is the number of columns; Klasses maps a class name to its column
#   - Eman maps a column number to a printable attribute name

# Greedily applies treatments[1], treatments[2], ... as row filters on `data`
# and keeps the last filtered subset that did not trip a stopping rule.
#   treatments : array 1..n of "attr SUBSEP val" strings (see addConstraint)
#   data       : input table
# All remaining parameters are locals / optional output arrays.
# Prints the accumulated recommendation string (and, when Verbose, a full
# trace plus a reduction summary); appends per-class ASIS/TOBE rows to the
# file named by Log when Log is set.
function apply(treatments,data, baseEstimateData,finalEstimateData,constraints,t,optimal,filteredData) {
#   saya(treatments,"TREATMETNS")
#   saya(Eman,"EMAN")

    #Save the original estimate data
    for (r=1; r<=data[0]; r++) {
        for (c=1; c<=Cols; c++) {
            baseEstimateData[r,c] = data[r,c]
        }
    }
    baseEstimateData[0] = data[0]

    getClassRanges(baseEstimateData, classRanges)
    scoreData(baseEstimateData, classRanges, baseRowToScore)
    baseMedian = findMedian(baseRowToScore)
    baseSpread = findSpread(baseRowToScore)

    if (Verbose) {print "\n\n---------------Queries---------------"}
    if (Verbose) {describeData("Query #0", baseEstimateData, baseRowToScore)}

    optimal=0
    t=1
    previousMedian = baseMedian
    previousSpread = baseSpread

    while(!optimal) {
        #             -----input------  ---output---
        addConstraint(treatments[t], constraints)
        filter(data, constraints, filteredData)
        split("",scores,"")
        scoreData(filteredData, classRanges, scores)
        filteredMedian = findMedian(scores)
        filteredSpread = findSpread(scores)

        #Stopping Rules
        if (!treatments[t]) {
            optimal=1
            printf (Verbose ? "STOPPING. No more treatments left:\n" : "")
        }
        if (filteredData[0] < 3) {
            optimal=1
            printf (Verbose ? "STOPPING. Next query too small:\n" : "" )
        }
        if (filteredMedian >= previousMedian && filteredSpread >= previousSpread) {
            optimal=1
            printf (Verbose ? "STOPPING. Next query shows no improvement:\n" : "" )
        }
        if (t == 1) #first treatment always works
            optimal=0

        previousMedian = filteredMedian
        previousSpread = filteredSpread

#       if (optimal)
#           print "^^^^^^^Optimal^^^^^^^^^"

        if (!optimal) {
            # This query is accepted: it becomes the current "final" answer.
            split("",finalEstimateData,"")
            split("",finalScores,"")
            for (r=1; r<=filteredData[0]; r++) {
                for (c=1; c<=Cols; c++) {
                    finalEstimateData[r SUBSEP c] = filteredData[r SUBSEP c]
                }
                finalEstimateData[0] = filteredData[0]
                finalScores[r] = scores[r]
            }
        }
        if (Verbose) {describeData("Query #"t, filteredData, scores) }
        t++
    }

    if (Verbose) {print "---------------Results---------------"}
    if (Verbose) {describeData("Baseline", baseEstimateData, baseRowToScore) }

#   printf "Recommendation: "
    # NOTE(review): `recommendation` is a global that is only ever appended
    # to here; if apply() is called more than once it keeps growing.
    for (col in constraints) {
        split(constraints[col],tmp,SUBSEP)
        for (val in tmp)
            recommendation = recommendation Eman[col]"="tmp[val]" "
    }
#   print recommendation"\n"

    if (Verbose) {describeData("Final", finalEstimateData, finalScores)}

    baseScoreMedian = findMedian(baseRowToScore)
    finalScoreMedian = findMedian(finalScores)
    baseScoreSpread = findSpread(baseRowToScore)
    finalScoreSpread = findSpread(finalScores)

    # The base value is the divisor, so it must be checked as well as the
    # final value; a zero base would otherwise abort with division by zero.
    if (finalScoreMedian == 0 || baseScoreMedian == 0)
        scoreMedianReduction = 0
    else
        scoreMedianReduction = 100 * (baseScoreMedian - finalScoreMedian) / baseScoreMedian
    if (finalScoreSpread == 0 || baseScoreSpread == 0)
        scoreSpreadReduction = 0
    else
        scoreSpreadReduction = 100 * (baseScoreSpread - finalScoreSpread) / baseScoreSpread

#   keyStr = "Relation.ProjName,ScoreMedian,ScoreSpread"
#   outStr = "@@@"Relation"."ProjName"."RankedOverride","scoreMedianReduction","scoreSpreadReduction
    outStr = " score median: "sprintf("%4.0f",scoreMedianReduction)"%"
    outStr = outStr"\n score spread: "sprintf("%4.0f",scoreSpreadReduction)"%"

    RankedOverride=""
    if (Log) {
        if (NumKlasses > 1) {
#           printf "score.ASIS."(Note ? Note : Relation)"."ProjName""(RankedOverride ? "."RankedOverride : "")"," >> Log
#           printf arr2str(baseRowToScore) >> Log
#           printf "\nscore.TOBE."(Note ? Note : Relation)"."ProjName""(RankedOverride ? "."RankedOverride : "")"," >> Log
#           printf arr2str(finalScores) >> Log
#           printf "\n" >> Log
        }
    }

    for(key in Klasses) {
        split("",tmpBase,"")
        split("",tmpFinal,"")
        for(r=1; r<=baseEstimateData[0]; r++) {
            tmpBase[r] = baseEstimateData[r,Klasses[key]]
        }
        for(r=1; r<=finalEstimateData[0]; r++) {
            tmpFinal[r] = finalEstimateData[r,Klasses[key]]
        }
        baseMedian = findMedian(tmpBase)
        finalMedian = findMedian(tmpFinal)
        baseSpread = findSpread(tmpBase)
        finalSpread = findSpread(tmpFinal)

        # Same divisor guard as for the score reductions above.
        if (finalMedian == 0 || baseMedian == 0)
            medianReduction = 0
        else
            medianReduction = 100 * (baseMedian - finalMedian) / baseMedian
        if (finalSpread == 0 || baseSpread == 0)
            spreadReduction = 0
        else
            spreadReduction = 100 * (baseSpread - finalSpread) / baseSpread

#       keyStr = keyStr","key"median,"key"spread"
#       outStr = outStr","medianReduction","spreadReduction
        outStr = outStr"\n"sprintf("%8s",key)" median: "sprintf("%4.0f",medianReduction)"%"
        outStr = outStr"\n"sprintf("%8s",key)" spread: "sprintf("%4.0f",spreadReduction)"%"

        if (Log) {
            # "%s" keeps a '%' inside a name or value from being read as a
            # printf conversion.
            printf "%s", key".ASIS."(Note ? Note : Relation)"."ProjName""(RankedOverride ? "."RankedOverride : "")"," >> Log
            printf "%s", arr2str(tmpBase) >> Log
            printf "%s", "\n"key".TOBE."(Note ? Note : Relation)"."ProjName""(RankedOverride ? "."RankedOverride : "")"," >> Log
            printf "%s", arr2str(tmpFinal) >> Log
            printf "\n" >> Log
        }
    }

    if (Verbose) {
        print ""
        print "-----------Reduction Summary-----------"
        print outStr
    }
    print recommendation
#   }
    if (!Verbose) {printf "."}
}

# Reorders the rows of `data` in place into ascending order of scores[row],
# breaking ties with a tiny random jitter, and sets data[0] to the number of
# scored rows.
function sortData(data, scores,    copy,serocs,newdata,row,c,n,savedFmt) {
    # Scratch arrays are local so nothing survives from an earlier call.
    split("",copy,"")
    split("",serocs,"")
    split("",newdata,"")

    # The jittered score is used as an array subscript. Subscripts are
    # strings built with CONVFMT, and the default "%.6g" rounds the jitter
    # away, so tied rows would share one key. "%.17g" keeps distinct
    # doubles distinct.
    savedFmt = CONVFMT
    CONVFMT = "%.17g"

    for (row in scores) {
        copy[row] = 0.000001 * rand() + scores[row]
        serocs[copy[row]] = row
    }
    n = asort(copy)
    data[0] = n

    for (row=1; row<=n; row++) {
        for (c=1; c<=Cols; c++)
            newdata[row SUBSEP c] = data[serocs[copy[row]] SUBSEP c]
    }
    for (row=1; row<=n; row++) {
        for (c=1; c<=Cols; c++)
            data[row SUBSEP c] = newdata[row SUBSEP c]
    }

    CONVFMT = savedFmt
}

# For every class column in Klasses, records the smallest and largest value
# found in `data` as ranges[col SUBSEP "min"] and ranges[col SUBSEP "max"].
# Relies on the global Inf for the initial bounds.
function getClassRanges(data, ranges, class,min,max,colnum,row) {
    for (class in Klasses) {
        colnum = Klasses[class]
        min = Inf
        max = -1*Inf
        for (row=1; row<=data[0]; row++) {
            if (data[row SUBSEP colnum] > max)
                max = data[row SUBSEP colnum]
            if (data[row SUBSEP colnum] < min)
                min = data[row SUBSEP colnum]
        }
        ranges[colnum SUBSEP "min"]=min
        ranges[colnum SUBSEP "max"]=max
    }
}

# Fills scores[r] for r = 1..data[0] with the mean, over all class columns,
# of the row's value min-max normalised with `ranges` (see getClassRanges).
# A class whose range is empty (max == min) contributes 0.
function scoreData(data, ranges, scores, numClasses,key,r,score,class,colnum,min,max) {
    if (!data[0]) {
        # Nothing to score. The diagnostic goes to stderr and only in
        # Verbose mode, because empty filtered sets occur in normal runs.
        if (Verbose)
            print "Can't score data with no defined size" > "/dev/stderr"
        return
    }

    numClasses=0
    for(key in Klasses)
        numClasses++

    for (r=1; r<=data[0]; r++) {
        score=0
        for (class in Klasses) {
            colnum = Klasses[class]
            min = ranges[colnum SUBSEP "min"]
            max = ranges[colnum SUBSEP "max"]
            if (max - min == 0)
                score += 0
            else
                score += (data[r SUBSEP colnum] - min) / (max - min)
        }
        # Normalize score to 1.0; with no classes defined the score is 0
        # rather than a division by zero.
        scores[r] = (numClasses ? score/numClasses : 0)
    }
}

# Prints a human-readable summary of `data` and its `scores` under the
# heading `name`: size, sorted scores with median/spread, and for each class
# in Klasses the sorted class values with median/spread.
# Returns (and leaves in the global strdesc) a comma-separated string:
# score median, score spread, then median and spread per class.
function describeData(name, data, scores,    r,t,class,tmp) {
    strdesc = ""
    print name" (size: "data[0]")"

    asort(scores, scoresCopy)
    print "\tScore-Median: "findMedian(scoresCopy)
    print "\tScore-Spread: "findSpread(scoresCopy)
    strdesc = findMedian(scoresCopy)","findSpread(scoresCopy)

    printf "\tScores: "
    for (r=1; r<=data[0]; r++)
        printf "%.3f ",scoresCopy[r]
    print ""

    for (class in Klasses) {
        split("",tmp,"")
        printf "\t%s: ", class
        t=0
        # Read the column that belongs to this class; a fixed column
        # number would report the same values under every class label.
        for (r=1; r<= data[0]; r++)
            tmp[++t] = data[r SUBSEP Klasses[class]]
        asort(tmp)
        for (r=1; r<= data[0]; r++)
            printf "%s ", tmp[r]
        print ""
        print "\t"class"-median: "findMedian(tmp)
        print "\t"class"-spread: "findSpread(tmp)
        strdesc = strdesc","findMedian(tmp)","findSpread(tmp)
    }
    print ""
    return strdesc
}

#Add a treatment to the list of constraints
#   treatment   : "attr SUBSEP val"
#   constraints : constraints[attr] is a SUBSEP-joined list of accepted values
# Each attr/val pair is added at most once; pairs are remembered in the
# global Seen.
function addConstraint(treatment, constraints,    tmp,attr,val) {
    split(treatment,tmp,SUBSEP)
    attr = tmp[1]
    val = tmp[2]
    if(!Seen[attr SUBSEP val]) {
        # Test membership, not truthiness: an existing value such as "0"
        # must be extended, not overwritten.
        if(attr in constraints) #Have we already constrained this attribute?
            constraints[attr] = constraints[attr] SUBSEP val #if so, extend its range
        else
            constraints[attr] = val
    }
    Seen[attr SUBSEP val]=1
}

# Copies into filteredData (rows renumbered 1.., count in filteredData[0])
# every row of `data` that satisfies all constraints. filteredData is
# emptied first; with no rows passing, filteredData[0] stays unset.
function filter(data, constraints, filteredData, tmp,passesNeeded,    key,row,col,c,passes,success,value,possibleValues) {
    split("",filteredData,"")

    passesNeeded=0
    for (key in constraints)
        passesNeeded++

    for(row=1; row<=data[0]; row++) {
        #Row passes if all attributes match constraint values
        #Disjunctions form if multiple constraint values for a single attribute exist
        passes=0
        for(col in constraints) {
            success=0
            split(constraints[col], possibleValues, SUBSEP)
            for (value in possibleValues) {
                if (possibleValues[value] == data[row,col]) {
                    success=1
                }
            }
            if(success) { #Guarantees we can't somehow match multiple times in a range for a single attr
                passes++
            }
        }
        if (passes > passesNeeded || passes < 0) #Sanity check
            print "WHAT THE HELL? HOW DID YOU MATCH MORE CONSTRAINTS THAN THERE ARE CONSTRAINTS?"
        if(passes == passesNeeded) { #All constraints matched
            filteredData[0]++ #Ensure monotonic increasing order in filtered set indexes
            for(c = 1; c <= Cols; c++) {
                filteredData[filteredData[0],c] = data[row,c] #Add to filtered set
            }
        }
    }
}

# Walks ranked[1..rankeds] and adds each not-yet-seen entry as a constraint
# via addNextConstraint, stopping early once that reports "optimal" while
# AutoStop is set. Builds the global Query string and publishes the number
# of ranked entries in the global RR (read by anyBetter).
function selects(k1,data,rankeds,ranked, constraints,n,selected,previous,isOptimal,seen) {
    print "K1:"k1
#   saya(data,"data")
    print "rankeds:"rankeds
#   saya(ranked,"ranked")
    RR = rankeds ##BAD
    for(n=1;n<=rankeds && (!isOptimal || !AutoStop);n++) {
        if (!seen[ranked[n]]) {
            seen[ranked[n]] = 1
            # `_` is a separator defined outside this section.
            split(ranked[n], attrange,_);
            attr = Eman[attrange[1]]
            range = attrange[2]
            Query = (Query ? Query" "attr"="range : attr"="range)
            copya(selected,previous)
            isOptimal = addNextConstraint(k1,n,data,ranked[n],constraints,selected,previous)
        }
    }
}

# Extends `constraints` with `constraint`, recomputes `selected`, asks
# anyBetter whether to stop, and reports the new selection unless stopping
# under AutoStop. Returns anyBetter's verdict.
function addNextConstraint(k1,n,data,constraint,constraints,selected,previous,isOptimal) {
    extendConstraint(constraint, constraints)
    split("",selected,"")
    selectRows(data,constraints,selected)
    isOptimal = anyBetter(data,selected,previous,n)
    if (!isOptimal || !AutoStop)
        report(k1,n,data,constraint, constraints, selected)
    return isOptimal
}

# Decides whether the search should stop after the n-th constraint by
# comparing the scores of the `selected` rows with those of the `previous`
# rows. Returns 1 to stop, otherwise an unset/empty value. When stopping and
# Report is set, prints the baseline-versus-final summary.
function anyBetter(data,selected,previous,n, key,curScore,prevScore,curCount,prevCount,stop,m,i,r,rowKlasses,oldKlasses) {
    for(key in selected) {
#       split("",rowKlasses,"")
#       getRowClassVals(key, data, Klasses, rowKlasses)
#       curScore[key] = score(rowKlasses)
        curScore[key] = scoreRow(key, data, Klasses)
        curCount++
    }
    for(key in previous) {
#       split("",oldKlasses,"")
#       getRowClassVals(key, data, Klasses, oldKlasses)
#       prevScore[key] = score(oldKlasses)
        prevScore[key] = scoreRow(key, data, Klasses)
        prevCount++
    }

    if (AutoStop == 2) { ##stop when we have fewer than 5 things
        if (curCount < 5)
            stop = 1
    }
    if (AutoStop == 1) { ##stop on fewer than 3 items, or when neither median nor spread improves
        if (curCount < 3)
            stop = 1
        if (findMedian(prevScore) > 0) {
            if (findMedian(curScore) >= findMedian(prevScore) && findSpread(curScore) >= findSpread(prevScore))
                stop = 1
        }
    }
    if (n == RR) #hack to stop when we run out of constraints.
        stop = 1

    if(stop && Report) {
        if (findMedian(prevScore)<=0)
            print "No matches on first constraint"
        else {
            print Relation" "ProjName" ("Samples" Samples)"
            print "W misunderestimates the following (Q_i*): "Query
            print ""
            print "Goal Scores normalized from 0-100 (lower is better)"
            m = asort(Baseline)
            printf "Baseline: "m"{ "
            for(i=1; i<=m; i++) {
                if (i%10 == 1 && i != 1)
                    printf "\n "
                printf "%.2f ",Baseline[i]*100
            }
            print "}"
            m = asort(prevScore)
            printf "Final: %2d{ ",m
            for(i=1; i<=m; i++) {
                if (i%10 == 1 && i != 1)
                    printf "\n "
                printf "%.2f ",prevScore[i]*100
            }
            print "}"
            print ""
            print " 25% 50% 75%"
            printf "Baseline: %2.2f %2.2f %2.2f\n",100*find25(Baseline),100*findMedian(Baseline),100*find75(Baseline)
            printf " Final: %2.2f %2.2f %2.2f\n",100*find25(prevScore),100*findMedian(prevScore),100*find75(prevScore)
            print ""
            printReductionStats(Baseline,prevScore)
            print"------------------------------------\n"
        }
    }
    return stop
}

# Adds one "attr<sep>range" constraint. constraints[attr] is kept as a
# parenthesised alternation: "(r1)" at first, then "(r2|r1)", and so on.
function extendConstraint(constraint,constraints, attrange,attr,range) {
    split(constraint, attrange,_);
    attr = attrange[1]
    range = attrange[2]
    if (attr in constraints)
        # Drop the old leading "(" and prepend the new alternative.
        constraints[attr] = "(" range "|" substr(constraints[attr],2)
    else {
        constraints[attr] = "(" range ")"
    }
#   if (SelectsDebug)
#       saya(constraints,"constrants")
}

# Marks selected[row]=1 for every row of `data` accepted by selectRow.
function selectRows(data,constraints,selected, row) {
    for(row=1;row<=data[0];row++)
        if ( selectRow(data,row,constraints) )
            selected[row]=1
}

# Returns 1 when the row passes every constraint, else 0.
# NOTE(review): the match uses the cell value as the pattern and the
# constraint string as the subject, so it behaves like "cell value occurs
# inside the constraint text". Confirm this direction is intended.
function selectRow(data,row,constraints, col) {
#   saya(data,"data")
#   saya(constraints,"cons")
    for(col in constraints)
        if ( constraints[col] !~ data[row,col] ) #############
            return 0
    return 1
}

# Records the outcome of applying `constraint`: appends the attribute name
# to the global Chose and stores the sorted scores of the selected rows in
# the global Previous (count in Previous[0]). All printing is disabled.
function report(k1,n,data,constraint, constraints, selected, \
                all,attr,range,attrange,row,tmp,i,str,sep,j,max,sorted,rowKlasses) {
    split(constraint, attrange,_);
    attr = Eman[attrange[1]]
    range = attrange[2]
    ##BAD
    Chose = Chose attr" "
    ##/BAD
    for(row in selected) {
        all++
#       split("",rowKlasses,"")
#       getRowClassVals(row, data, Klasses, rowKlasses)
#       tmp[++i] = score(rowKlasses)
        tmp[++i] = scoreRow(row, data, Klasses)
    }
    max=asort(tmp,sorted)
    for(j=1;j<=max;j++) {
        str = str sep sorted[j]
        sep=" "
    }
#   printf(n==1 ? " " : "and ")
#   printf("n="n ": " attr "= " constraints[attrange[1]] " :\t\t ")
#   printf find25(sorted) "\t" findMedian(sorted) "\t" find75(sorted) "\t"
    ##begin ugly code##
    #store for later
    if (max > 0) {
        Previous[0] = max
        # NOTE(review): the text of this loop was damaged in the file as
        # received ("for (k=1; k 0)"). Copying the sorted scores into
        # Previous[1..max] is a reconstruction based on the "store for
        # later" comment and the [0]-holds-size convention; verify against
        # a clean copy of the source.
        for (k=1; k<=max; k++)
            Previous[k] = sorted[k]
#       printf "Better"
#       else
#       printf "Worse"
#       printf "\t"
    }
    ##end ugly code##
#   printf max
#   print (all <= 30) ? "*{" str "}" : "*{..}"
}