BEGIN { # command-line options
		# -- sampling and neighbourhood sizes (handed to worker() by main())
		Samples   = 50
		K1        = 5
		K2        = 15

		# -- data split and randomisation
		Seed      = 1       # passed to srand() when the @data line is seen
		Tests     = 0.33    # a data row goes to Test when rand() <= Tests

		# -- contrast algorithm label: non-zero prints "Nomograms", else "B-Squared"
		Nomograms = 0.66

		# -- not referenced in this part of the file
		AutoStop  = 1
		Report    = 0
		OutFile   = "log.txt"
}
BEGIN { # internal options
		# awk built-in variables
		OFS        = ","        # output field separator
		CONVFMT    = "%.8g"     # number-to-string conversion format
		IGNORECASE = 1          # gawk: case-insensitive regexp/string matching

		# script-wide constants
		Inf = 10^32             # stand-in for "infinity" (initial min/max, Median, Spread)
		_   = SUBSEP            # shorthand used when building multi-part array keys by hand
}

# Entry point: hands the configured sample count and the two
# neighbourhood sizes to worker().
function main() {
	worker(Samples, K1, K2)
}
# Print the run settings, then run the training phase followed by the
# testing phase.
#   samples, k1, k2 : forwarded unchanged to train() and test()
#   rankeds, ranked, algo : locals; `ranked` is filled by train() and
#                           `rankeds` is train()'s return value
function worker(samples, k1,k2,        rankeds, ranked, algo) {
	# --- settings banner ---
	print "samples         : " samples
	print "k1              : " k1
	print "k2              : " k2
	print "%test           : " (Tests * 100)

	if (Nomograms)
		algo = "Nomograms"
	else
		algo = "B-Squared"
	print "Contrast Algo   : " algo
	print ""

	# --- training phase ---
	print "Training results on " Train[0] " historical examples (what looks useful):"
	rankeds = train(samples, k1, k2, ranked)

	# --- testing phase ---
	print "Test results on " Test[0] " new projects (applying the training results to new data):\n"
	test(samples, k1, k2, rankeds, ranked)
}

###############################
## Training: Rank Attributes ##
###############################
# Training pipeline: build example projects from Train, find their nearest
# Train rows, split those rows into best/rest and rank the contrast between
# the two groups.
#   samples, k1, k2 : sizes forwarded to the helper functions
#   ranked          : output array, filled by rank()
# Returns whatever rank() returns (the caller uses it as the number of
# ranked entries).
#
# The scratch arrays are named projs/dists rather than projects/neighbors:
# POSIX awk does not allow a parameter to share its name with a function,
# and both projects() and neighbors() are functions called here.
function train(samples,k1,k2,ranked,      \
			 projs, dists, memos, best, rest,\
			 knearest) {
	#       inputs                              outputs
	projects(Train,samples,                     projs)            # example1 projects
	neighbors(samples,projs,Train[0],Train,     dists,memos)      # distances example1 to Train set
	knn(k1+k2,samples,dists,memos,              knearest)         # knearest Train instance row numbers to example1 projects
	bestRest(knearest,k1,                       best,rest)        # divide knearest into best/worst
	return rank(k1,k2,best,rest,                ranked)           # contrast set between best/worst
}

####################################
## Testing: Apply Best Attributes ##
####################################
# Testing loop: for run = 0..rankeds (or until the global Stop is set),
# optionally pin one attribute of Test to a single ranked value, build
# example projects, collect the scores of their nearest Test rows and print
# the 25%/50%/75% points of the sorted scores.
#   samples, k1, k2 : sizes forwarded to projects()/neighbors()/knn()
#   rankeds         : highest index of `ranked` to try
#   ranked          : ranked[run] is split on SUBSEP into (attribute, value)
# Globals written: Median, Spread, Stop, run, baseMedian, baseSpread,
# prevMedian, prevSpread, and Test["range",...] (narrowed in place).
# Those lower-case globals are left global because code outside this
# function may read them -- TODO confirm and localise if not.
#
# Changes from the earlier version:
#   * scratch arrays renamed projs/dists: POSIX awk does not allow a
#     parameter to share its name with a function (projects(), neighbors())
#   * j, k, r, tmp, constraint, cval, rowKlasses are now locals
#   * r restarts at 0 for every row, so rowKlasses is always indexed 1..n
#   * n and data are reset for every run
function test(samples,k1,k2,rankeds,ranked,    \
			          i,projs,dists,memos,knearest,\
					  m,n,sorted,kloc,row,col,data,\
					  j,k,r,tmp,constraint,cval,rowKlasses) {
	Median = Inf
	Spread = Inf

	for(run=0; run<=rankeds && !Stop; run++) {

		# start every run with empty scratch state
		split("",projs,"")
		split("",dists,"")
		split("",memos,"")
		split("",knearest,"")
		split("",sorted,"")
		split("",kloc,"")
		split("",data,"")
		n = 0

		# NOTE(review): runs 0 and 1 both execute without applying any ranked
		# range (the guard is run>1) -- confirm that ranked[1] is meant to be skipped.
		if (run>1) {
			split(ranked[run],tmp,SUBSEP)
			constraint = tmp[1]
			cval = tmp[2]
			# replace the attribute's list of ranges with the single value cval
			for(j=1;j<=Test["range",constraint,0];j++) {
				delete Test["range",constraint,j]
			}
			Test["range",constraint,0] = 1
			Test["range",constraint,1] = cval
		}

		projects(Test,samples,                      projs)            # different example2 projects
		neighbors(samples,projs,Test[0],Test,       dists,memos)      # distances example2 to Test set
		knn(k1+k2,samples,dists,memos,              knearest)         # knearest Test instances row numbers to example2 projects
		for(row=1;row<=Test[0];row++)
			if (row in knearest) {
				data[0]++

				# gather this row's class-column values, indexed from 1
				split("",rowKlasses,"")
				r = 0
				for (k in Klasses) {
					rowKlasses[++r] = Test[row,Klasses[k]]
				}

				kloc[++n]= int(score(rowKlasses))
				for(col=1;col<=Cols;col++)
					data[data[0],col]=Test[row,col]               # convert row numbers to their data rows
			}
		m=asort(kloc,sorted)                                          # report baseline distributions

		if (run == 0) {
			baseMedian = findMedian(sorted)
			baseSpread = find75(sorted) - find25(sorted)
		}

		prevSpread = Spread
		prevMedian = Median

		Spread = find75(sorted) - find25(sorted)
		Median = findMedian(sorted)

		# stop once a constrained run improves neither the median nor the spread
		if (Median >= prevMedian && Spread >= prevSpread && run > 1)
			Stop = 1

		# explicit "%s" so the assembled text is never interpreted as a format
		printf "%s", "Query "run":\t\t"find25(sorted) "\t" findMedian(sorted) "\t" find75(sorted)"\t\t20*{"

		for(i=1;i<=m;i++)
			printf("%s ", sorted[i])
		print "}"
	}

	# NOTE(review): awk aborts on division by zero, so this line fails when
	# baseMedian or baseSpread is 0 -- confirm that cannot happen for real data.
	print "@@@"Relation"."ProjName","(baseMedian - prevMedian)/baseMedian","(baseSpread-prevSpread)/baseSpread
#	print "MedianReduction: " ((baseMedian - prevMedian) / baseMedian) * 100

#	saya(Test,"test")
	
#	split("",Previous,"")
#	split("",Baseline,"")
#	Previous[0] = m
#	for (k=1; k<=Previous[0]; k++) {
#		Previous[k] = sorted[k]
#		Baseline[k] = sorted[k]
#	}


#	print "\nResults of applying the  top n-th ranges found during training\n"
#	selects(k1,data,rankeds,ranked)                               # try the tricks found during training on the knearest Test instances
}

#######################################
## Parse Project and Historical Data ##
#######################################
              { gsub(/%.*/,"") }
/^[	\t]$/     { next }
/^@project/   { In = 0 }
In            { rand() <= Tests ? cells(Test,Cols) : cells(Train,Cols) }
/^@relation/  { Relation=$2 }
/^@attribute/ { def($2) }
/^@class/     { defclass($2) }
/^@data/      { In = 1; inits(Cols) }
/^@/          { next }

# Called when the @data line is seen: seeds the random number generator
# (Seed, or 1 when Seed is zero/empty) and primes the per-column min/max
# trackers of both Train and Test so that any real value replaces them.
#   cols : number of columns to initialise
function inits(cols,  c) {
#	Klass = Klass < 0 ? cols + Klass + 1 : Klass 
	if (Seed)
		srand(Seed)
	else
		srand(1)
	for (c = 1; c <= cols; c++) {
		Train["max", c] = -1*Inf
		Train["min", c] = Inf
		Test["max", c]  = -1*Inf
		Test["min", c]  = Inf
	}
}
# Handle an @attribute line: register the attribute `name` (allocating a new
# column number on first sight) and replace its list of ranges in both Train
# and Test with fields $3..$NF of the current record.
# A "?" in the name is removed, and its presence marks the column in Goal.
#   name      : attribute name (the caller passes $2)
#   a,i,goalp : locals
function def(name,  a,i,goalp) {
	# "?" is escaped: an unescaped leading "?" is only treated as a
	# literal by some awks and is a syntax error in others
	goalp  = sub(/\?/,"",name)
	if (name in Name)  {
		a = Name[name]
	} else {
		a = Name[name] = ++Cols
		Eman[Cols]=name          # reverse map: column number -> name
	} 

	# Drop any ranges from an earlier definition. clearStack() copes with a
	# missing/zero count itself, so both calls are unconditional.
	clearStack(Train, "range" _ a) 
	clearStack(Test, "range" _ a) 

	for(i=3;i<=NF;i++) {
		Train["range",a, ++Train["range",a,0]] = $i
		Test[ "range",a, ++Test[ "range",a,0]] = $i
	}
	if (goalp) Goal[a]=1
}	
# Handle an @class line: register `name` as a column exactly like an
# attribute (new column number on first sight, ranges taken from $3..$NF of
# the current record) and additionally record its column in Klasses.
#   name : class name (the caller passes $2)
#   a, i : locals (previously leaked as globals)
function defclass(name,    a, i) {
	if (name in Name)  {
		a = Name[name]
	} else {
		a = Name[name] = ++Cols
		Eman[Cols]=name          # reverse map: column number -> name
	}

	# Drop any ranges from an earlier definition. clearStack() copes with a
	# missing/zero count itself, so both calls are unconditional.
	clearStack(Train, "range" _ a) 
	clearStack(Test, "range" _ a) 

	for(i=3;i<=NF;i++) {
		Train["range",a, ++Train["range",a,0]] = $i
		Test[ "range",a, ++Test[ "range",a,0]] = $i
	}
	Klasses[name] = a
}
# Empty the "stack" stored in array `a` under `key`: the element count lives
# at a[key _ 0] and the items at a[key _ 1] .. a[key _ count]. All items are
# deleted and the count is set to 0 (it is created if it did not exist).
#   a      : array holding the stack
#   key    : key prefix of the stack
#   i, top : locals (`top` rather than `max`, since max() is a function in
#            this program and POSIX awk forbids reusing a function name as
#            a parameter name)
function clearStack(a, key,    i, top) {
	top = a[ key _ 0 ]
	if (top)
		for(i=1;i<=top;i++)
			delete a[ key _ i ]
	a[key _ 0] = 0
}
# Append the current input record to `data` as a new row and fold each of
# its first `cols` fields into the per-column running max/min.
#   data : target array; data[0] is the row count
#   cols : number of fields to copy
function cells(data,cols,      c, row, v) {
	row = ++data[0]
	c = 1
	while (c <= cols) {
		v = $c
		data[row, c]   = v
		data["max", c] = max(data["max", c], v)
		data["min", c] = min(data["min", c], v)
		c++
	}
}