# no more From To. need to pull at random

BEGIN { # command-line options
		Samples = 20
		K1		= 5
		K2 		= 15
		Seed	= 1
		Klass	= -1
		Tests   = 0.33
}
BEGIN { # internal options
		OFS=","
		IGNORECASE=1 
		Inf = 10^32
		_ = SUBSEP
		CONVFMT="%.8g"
}

##################################################################
# main program

function main() { 
	worker(Samples,K1,K2) 
}
function worker(samples, k1,k2,        rankeds, ranked) {
	print "samples         : " samples 
	print "k1              : " k1 
	print "k2              : " k2
	print "%test           : " Tests*100 "\n"
	print "Training results on " Train[0] " historical examples (what looks useful):"
	rankeds = train(samples,k1,k2,ranked)
	saya(ranked,"ranked")
	print "Test results on " Test[0] " new projects (applying the training results to new data):\n"
	test( samples,k1,k2,rankeds,ranked) 
}
function train(samples,k1,k2,ranked,      \
			 projects, neighbors, memos, best, rest,\
             knearest,rankeds) {
	#       inputs                              outputs
    #       ------                              -------
	projects(Train,samples,                     projects)         # example1 projects
	neighbors(samples,projects,Train[0],Train,  neighbors,memos)  # distances example1 to Train set
	knn(k1+k2,samples,neighbors,memos,          knearest)         # knearest Train instance row numbers to example1 projects
	bestRest(knearest,k1,                       best,rest)        # divide knearest into best/worst 
	rankeds = rank(k1,k2,best,rest,             ranked)           # contrast set between best/worst
	return rankeds
}
function test(samples,k1,k2,rankeds,ranked,    \
			          i,projects,neighbors,memos,knearest,\
					  m,n,sorted,kloc,row,col,data) {
	projects(Test,samples,                      projects)         # different example2 projects
	neighbors(samples,projects,Test[0],Test,    neighbors,memos)  # distances example2 to Test set
	knn(k1+k2,samples,neighbors,memos,          knearest)         # knearest Test instances row numbers to example2 projects
	for(row=1;row<=Test[0];row++)         
		if (row in knearest) {
			data[0]++
			kloc[++n]= int(Test[row,Klass])
			for(col=1;col<=Cols;col++)
				data[data[0],col]=Test[row,col]               # convert row numbers to their data rows
	}
	m=asort(kloc,sorted)                                          # report baseline distributions
	print "Baseline (estimates without any project changes): " 
	for(i=1;i<=m;i++)
		printf("%s ", sorted[i])
	print "\n\nResults of applying the  top n-th ranges found during training\n"
	selects(k1,data,rankeds,ranked)                               # try the tricks found during training on the knearest Test instances
}
##################################################################
# read in data

              { gsub(/%.*/,"") }
/^[	\t]$/     { next }
/^@project/   { In = 0 }
In            { rand() <= Tests ? cells(Test,Cols) : cells(Train,Cols) }
/^@relation/  { Relation=$2 }
/^@attribute/ { def($2) }
/^@data/      { In = 1; inits(Cols) }
/^@/          { next }

function inits(cols,  i) {
	Klass = Klass < 0 ? cols + Klass + 1 : Klass 
	srand(Seed ? Seed : 1) 
	for(i=1;i<=cols;i++) { Train["max",i]= -1*Inf; Train["min",i]=Inf }
	for(i=1;i<=cols;i++) { Test[ "max",i]= -1*Inf; Test[ "min",i]=Inf }
}
function def(name,  a,i,goalp) {
	goalp  = sub(/?/,"",name)
	if (name in Name)  {
		a = Name[name]
	} else {
		a = Name[name] = ++Cols
		Eman[Cols]=name
	} 
	if (Train["range",a,0])
		
	clearStack(Train, "range" _ a) 
	clearStack(Test, "range" _ a) 

	for(i=3;i<=NF;i++) {
		Train["range",a, ++Train["range",a,0]] = $i
		Test[ "range",a, ++Test[ "range",a,0]] = $i
	}
	if (goalp) Goal[a]=1
}	
function clearStack(a, key,    i, max) {
	if (max = a[ key _   0 ])
		for(i=1;i<=max;i++)
			delete a[ key _ i ]
	a[key _ 0] = 0
}
function cells(data,cols,      col) {
	data[0]++
	for(col=1;col<=cols;col++)  {
		data[data[0],col] = $col
		data["max",col]   = max(data["max",col],$col)
		data["min",col]   = min(data["min",col],$col)
	}
}
##################################################################
# generate projects

function projects(data,news,new,  row,col,v) { 
	new[0]=news
	for(col=1;col<=Cols;col++)  {
		new["max",col]= -1*Inf
		new["min",col]= Inf
		for(row=1;row<=news;row++)  {
			v = projectValue(data,col)
			new[row,col]   = v
			new["max",col] = max(new["max",col],v)
			new["min",col] = min(new["min",col],v)
		}
	}
}
function projectValue(data,a,   max,one) {
	max = data["range",a,0]
	one = int(rand() * max) + 1
	return data["range",a,one]
}

##################################################################
# find the k-th nearest historial projects near the generated projects

function euclidean(row1,row2,data1,data2,    n,col,d,d1,d2) {
	for(col=1;col<=Cols;col++)
		if (col != Klass) {
			d1 = normalize(data1,col,data1[row1,col])
			d2 = normalize(data2,col,data2[row2,col])
			d += abs(d1 - d2)^2
			n++
	}
	return  sqrt(d)/sqrt(n)
}
function distance(row1,row2,data1,data2,memo,  d) {
	d = as100(euclidean(row1,row2,data1,data2))
	memo[-1 * d] = row1  # d started at row1
	memo[d]      = row2  # d ended at row2
	return d
}
function normalize(data,col,n,      min,max,d) { 
	min = data["min",col]
	max = data["max",col]
	d = min == max ? 1 : (n - min) /(max - min) 
	#print "n " n " min " min " max " max " d " d
	return d
}
function neighbors(news,new,olds,old,neighbor,memo,   o,n) {
	for(n=1;n<=news;n++) 
		for(o=1;o<=olds;o++) 
			push2(distance(n,o,new,old,memo),neighbor,n)
}
function knn(k,news,neighbor,memo,ks,    dist,n,most,i,d,sorted) {
	for(n=1;n<=news;n++) {
		most = neighbor[n,0]
		for(i = 1;i <=  most;i++)  
			dist[++d] = neighbor[n,i]
	}
	knnDebug(dist,memo)
	asort(dist,sorted)
	for(i=1;i<=d;i++)  {
		n  = memo[sorted[i]]
		if ( ++ks[n]==1 ) k--
		if (k == 0)  return i
    }	
	return k
}
function knnDebug(dist,memo,    com,i) {
	if (KnnDebug)  {
		com = "sort -n -k 2 | " KnnDebug
		for(i in dist)
			print memo[dist[i]] " " dist[i] | com
		close(com)
	}
}

##################################################################
# divide the k-th nearest historial projects into best (lowest)
# estiamted and the rest. collect frequency counts for best and rest.
# rank attribute ranges by how common they are in best and how
# rare they are in rest

function bestRest(rows,border,best,rest,    enough,n,scores,row) {
	for(row in rows) 
		scores[++n]= Train[row,Klass]
	asort(scores)
	enough = scores[border]
	for(row in rows)
		if ( Train[row,Klass] <= enough ) 
			count(row,best,rows[row])
		else  
			count(row,rest,rows[row]) 
}
function count(row,f,n,    col) {
	f[0]++
	for(col in Goal)
		f[col,Train[row,col]] += n
}
function rank(k1,k2,best,rest,ranked,  	\
		      range,bests,rests,i,b,r,score,scores,sorted,memo,max) {
	bests = best[0]
	rests = rest[0]
	for(i in best)  
		if (i != 0) {
			b             = best[i] / bests
			r             = rest[i] / rests
			if(Nomograms) {
				if(best[i] < (bests*Nomograms)) continue
				score         = log(b/(r+0.000001)) + rand()/10000# as100(b^2/(b+r))
			} else { score=as100(b^2(b+r)) }
			scores[i]     = score
			memo[score] = i
			if (RankDebug) 
				print no_(i) "\t" " b=" b " r= " r " score=" score
	}
	max = asort(scores,sorted)
	if (Nomograms) showRanks(memo,sorted,max)
	for(i=max; i>=1;i--)  # highest score must be forst
		ranked[max-i+1] = memo[sorted[i]]
	return max
}	
function showRanks (memo,sorted,max,  i, range,tmp,com) {
	com="sort -r -n | cat -n"
	print "#n\tscore\trange"
	print "------\t-----\t--------------"
	for(i=max; i>=1;i--) { # highest score must be forst
		range = memo[sorted[i]]
		split(range,tmp,_)
		print sprintf("%5.2f",sorted[i]) "\t" Eman[tmp[1]] " = " tmp[2]  | com
	}
	close(com)
	print ""
	
}
#########################################################################
# select and report subset of relevant rows that satisfy constraints 1..n	

function selects(k1,data,rankeds,ranked,			constraints,n) {
	for(n=1;n<=rankeds;n++)
		addNextConstraint(k1,n,data,ranked[n],constraints)
}
function addNextConstraint(k1,n,data,constraint,constraints,   selected) {
	extendConstraint(constraint, constraints)
	selectRows(data,constraints,selected)
	report(k1,n,data,constraint, constraints, selected)
}
function extendConstraint(constraint,constraints,    attrange,attr,range) {
	split(constraint, attrange,_);
	attr  = attrange[1]
	range = attrange[2]	
	if (attr in constraints) 
		constraints[attr] = "(" range "|" substr(constraints[attr],2) 
	else { constraints[attr] = "(" range ")" }
	if (SelectsDebug) 
			saya(constraints,"constrants")
}
function selectRows(data,constraints,selected,   row) {
	for(row=1;row<=data[0];row++)
		if ( selectRow(data,row,constraints) )
			selected[row]=1
}
function selectRow(data,row,constraints,   col) {
	for(col in constraints)
		if ( constraints[col] !~ data[row,col] )
			return 0
	return 1
}
function report(k1,n,data,constraint, constraints, selected,     \
			    all,attr,range,attrange,row,tmp,i,str,sep,j,max,sorted) {
	split(constraint, attrange,_);
	attr  = Eman[attrange[1]]
	range = attrange[2]	
	for(row in selected) {
		all++  
		tmp[++i] = int(data[row,Klass])
	}
	max=asort(tmp,sorted)
	for(j=1;j<=max;j++) {
		str = str sep sorted[j]
		sep=" "
	}
	printf(n==1 ? "    " :  "and ")
	printf("n="n ": " attr "= " constraints[attrange[1]]  " :\t\t " \
			sorted[round(i*0.25)] "\t" sorted[round(i*0.5)] "\t" sorted[round(i*0.75)] "\t" \
			max )
	print  (all <= 30) ? "*{" str "}" : "*{..}"
}

##################################################################
# standard utils

function barph(str)       { print str >"/dev/stderr"; fflush("/dev/stderr") }
function push2(v,a,i)     { a[i,++a[i,0]] = v; return v }
function push(v,a)        { a[++a[0]] = v; return v }
function as100(n)         { return (n*100) + rand()/10 }
function abs(n)           { return n < 0 ? -1* n : n }
function max(n1,n2)       { return n1<n2 ? n2 : n1 }
function min(n1,n2)       { return n1<n2 ? n1 : n2 }
function no_(str)         { gsub(_,",",str); return str }
function round(i)         { return int(i+0.5) }

function saya(a,s,   b,c,m,n,key,val,i,j,tmp,sep) {
	print ""
	m     = asorti(a,b)
	for(i=1;i<=m;i++)  {
		key=b[i]
		val=a[b[i]]
		printf("%s",  sep s "["  )
		n=split(key,tmp,_)
		c = ""
		for(j=1;j<=n;j++)	{	
			printf("%s", c tmp[j]  )
			c=","
		}
		if (val ~ _)
			val = no_(val)
		printf("%s", "]=" val )
		sep="\n";
	};
	print ""
 }