#!/sw/bin/gawk -f 

##########################################################################
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU Lesser General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU Lesser General Public License for more details.
#
#    You should have received a copy of the GNU Lesser General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
##########################################################################

# which2 : a stochastic anytime rule learner 
# (c) Tim Menzies (tim@menzies.us) 2010, LGLP 3.0 

# This program builds rules by ranking ideas, then repeatedly building new ideas
# by picking # and combining two old ideas (favoring those with higher ranks).  New
# ideas generated in this way are ranked and thrown back into the same pot as
# the old ideas so, if they are any good, they might be picked and extended
# in subsequent rounds. Alternatively, if the new idea stinks, it gets buried
# by the better ideas and is ignore.

# One important aspect of the following is that the scoring routines for
# ideas are completely seperate from the rest of the code (see the "score1"
# function). Hence, it is a simple # matter to try our different search biases.

# e.g. This call produces the following output.
# gawk -f which2.awk titanic.arff 

# In the following, the "candidates" are ideas that look promsing
# and "score" ranks the candidates. If the max score does not improve
# from the last round, then "lives" decreases.

# Each round tries random combinations of the stuff from prior rounds 
# (favoring those things with higher scores). Hence, at round 1,
# all the candidates are singletons. But. later on (see line 54)
# the candidates can grow to combinations of things.

# Each round prunes the candiates so that only the better candiadtes
# surive to round+1.

# The final output are the candidates of the last round.

# ----------------------------------------------------
# % class: yes seed: 1 round: 1 max: 0 lives: 5
# 
# % candidate: yes,age=child
# % candidate: yes,age=child,class=1st
# % candidate: yes,age=child,sex=female
# % candidate: yes,class=1st
# % candidate: yes,class=1st,class=2nd
# % candidate: yes,class=1st,sex=female
# % candidate: yes,class=2nd
# % candidate: yes,class=2nd,sex=female
# % candidate: yes,sex=female
# 
# [0.022196665,yes,class=1st,sex=female].
# [0.030848353,yes,class=2nd].
# [0.05809188,yes,class=1st].
# [0.12507705,yes,sex=female].
# [0.13519168,yes,class=1st,class=2nd].
# 
# ----------------------------------------------------
# % class: yes seed: 1 round: 2 max: 0.13519168 lives: 5
# 
# % candidate: yes,class=1st
# % candidate: yes,class=1st,class=2nd
# % candidate: yes,class=1st,class=2nd,sex=female
# % candidate: yes,class=1st,sex=female
# % candidate: yes,class=2nd
# % candidate: yes,class=2nd,sex=female
# % candidate: yes,sex=female
# 
# [0.022196665,yes,class=1st,sex=female].
# [0.030848353,yes,class=2nd].
# [0.056260989,yes,class=1st,class=2nd,sex=female].
# [0.05809188,yes,class=1st].
# [0.12507705,yes,sex=female].
# [0.13519168,yes,class=1st,class=2nd].
# 
# ----------------------------------------------------
# % class: yes seed: 1 round: 3 max: 0.13519168 lives: 4
# 
# % candidate: yes,class=1st
# % candidate: yes,class=1st,class=2nd
# % candidate: yes,class=1st,class=2nd,sex=female
# % candidate: yes,class=1st,sex=female
# % candidate: yes,class=2nd
# % candidate: yes,class=2nd,sex=female
# % candidate: yes,sex=female
# 
# [0.022196665,yes,class=1st,sex=female].
# [0.030848353,yes,class=2nd].
# [0.056260989,yes,class=1st,class=2nd,sex=female].
# [0.05809188,yes,class=1st].
# [0.12507705,yes,sex=female].
# [0.13519168,yes,class=1st,class=2nd].
# 
# ----------------------------------------------------
# % class: yes seed: 1 round: 4 max: 0.13519168 lives: 3
# 
# % candidate: yes,class=1st
# % candidate: yes,class=1st,class=2nd
# % candidate: yes,class=1st,class=2nd,sex=female
# % candidate: yes,class=1st,sex=female
# % candidate: yes,class=2nd
# % candidate: yes,class=2nd,sex=female
# % candidate: yes,sex=female
# 
# [0.022196665,yes,class=1st,sex=female].
# [0.030848353,yes,class=2nd].
# [0.056260989,yes,class=1st,class=2nd,sex=female].
# [0.05809188,yes,class=1st].
# [0.12507705,yes,sex=female].
# [0.13519168,yes,class=1st,class=2nd].
# 
# ----------------------------------------------------
# % class: yes seed: 1 round: 5 max: 0.13519168 lives: 2
# 
# % candidate: yes,class=1st
# % candidate: yes,class=1st,class=2nd
# % candidate: yes,class=1st,class=2nd,sex=female
# % candidate: yes,class=1st,sex=female
# % candidate: yes,class=2nd
# % candidate: yes,class=2nd,sex=female
# % candidate: yes,sex=female
# 
# [0.022196665,yes,class=1st,sex=female].
# [0.030848353,yes,class=2nd].
# [0.056260989,yes,class=1st,class=2nd,sex=female].
# [0.05809188,yes,class=1st].
# [0.12507705,yes,sex=female].
# [0.13519168,yes,class=1st,class=2nd].
# 
# ----------------------------------------------------
# % class: yes seed: 1 round: 6 max: 0.13519168 lives: 1
# 
# % candidate: yes,class=1st
# % candidate: yes,class=1st,class=2nd
# % candidate: yes,class=1st,class=2nd,sex=female
# % candidate: yes,class=1st,sex=female
# % candidate: yes,class=2nd
# % candidate: yes,class=2nd,sex=female
# % candidate: yes,sex=female
# 
# [0.022196665,yes,class=1st,sex=female].
# [0.030848353,yes,class=2nd].
# [0.056260989,yes,class=1st,class=2nd,sex=female].
# [0.05809188,yes,class=1st].
# [0.12507705,yes,sex=female].
# [0.13519168,yes,class=1st,class=2nd].
# 
# ----------------------------------------------------
# % class: yes seed: 1 round: 7 max: 0.13519168 lives: 0
# 
# % candidate: yes,class=1st
# % candidate: yes,class=1st,class=2nd
# % candidate: yes,class=1st,class=2nd,sex=female
# % candidate: yes,class=1st,sex=female
# % candidate: yes,class=2nd
# % candidate: yes,sex=female
# 
# [0.022196665,yes,class=1st,sex=female].
# [0.030848353,yes,class=2nd].
# [0.056260989,yes,class=1st,class=2nd,sex=female].
# [0.05809188,yes,class=1st].
# [0.12507705,yes,sex=female].
# [0.13519168,yes,class=1st,class=2nd].

BEGIN  {
    Beam=10;        # Only the top (say) 10 candidates survive to the next round
    Dull=0.1;       # Ignore candidates with score < Dull*MaxScore
	Eval1=5;
	Eval2=1;
	Inf=10^32;
    Lives=5;        # If no improvement after five rounds, give up
	Max=1           # If (1 or 0) try to (max or min) the score
    More = 1.02;    # Improvement means at least a 2% growth
	OverFitted=3;   # When do we prune a rule that matches on too few instances?
    Pinch = 1/1000; # Add a random number of up to "Pinch" to each score
    Samples=20;     # Pick this number of pairs of candidates from the last round
	Seed=1          # Random number see.
	Skip = 0.33
    Verbose=1       # Verbose = 0 means silence
    CONVFMT="%.8g"; # Increase the string size for array contents so we can see the Pinch
    IGNORECASE=1; 
    SUBSEP = "=";
    _ = SUBSEP
	OFS=" _ ";
    C=","
}

## --------------------------------------------------
#Data entry. Pretty routine stuff. 
/@attribute/ {Name[++Name[0]]=$2; Name[$2] = Name[0]}
             {gsub(/[ \t]*/,"")} # no blanks 
             {gsub(/%.*/,"")}    # no comments
/^$/         {next}              # no blank likes
/@data/      {In=1;FS=","; srand(Seed)}
/@/          {next}
In           {
			  Bin = rand() > Skip
              Rows[Bin]++; 
              train(Bin,Rows[Bin],Data,U) }
END          {learn(1,Rows,Data,U) }

function train(bin,row,d,u,   what,i) {
	if (! Max)
		$NF *= -1
    for(i=1;i<=NF;i++)  {
        if ($i == "?")       # skip missing values
            continue;
        what = Name[i]       # what is the name of column i?
        d[bin,row,what]=$i   # raw data storage
		u[what,$i]=1         # remember all the unique values
	}
}

# Now we can begin. Try learning rules for each hypothesis.
function learn(bin,rows,data,u,      r,class) {
	ranges(u,r)
	round0(bin,counts,rows,data,which0)
	oo(which0,"which0learn","-n -k 5")
    rounds(1,(Max ? -1* Inf : Inf) ,Lives,which0,rows,data,which)
}
for(class in k) 
        learn1(bin,class,all,h,rows,data,f,u,r)
}
# In round0, offer a rough ranking of
# the ranges. In subequent rounds, randomly select and combine ranges
# from prior randoms.
function learn1(bin,class,all,h,rows,data,f,u,r,   which0,which1) {
	print "\n" class
	Eval=Eval1;
	if (Verbose) print ""
    round0(bin,class,all,rows,f,u,h,which0); # make some initial guess
    rounds(bin,class,1,0,Lives,which0,rows,data,f,r,u,which1)
	#o(which1,"which1")
	Eval=Eval2;
	test(bin,class,which1,rows,data,f)
}
function ranges(u,r,  i,tmp) {
	for(i in u) {
		split(i,tmp,_)
		r[tmp[1]]++
	}
}
# In round one, score by b^2/(b+r); i.e. things more likely in
# the target class than otherwise (with some support weighting)
function round0(bin,class,all,rows,f,u,h,which,  some,i,j,b,r,s,memo,score) {
    for(i in u) {
        some = f[bin _ class _ i]
        r    = (all[bin _ i] - some)/(rows[bin]  - h[bin _ class] + Pinch)
        b    = some / (h[bin _ class] + Pinch)
		#print "i " i " some " some " r " r " b " b
        if (b > r) {
            j = class "," i
            s = b^2/(b+r) + rand()*Pinch
            memo[s] = j
            score[j]= s 
        }}
   chop(score,memo,which) # prune the dull candidates
}

# Given some score[key]=number and memo[number]=key,
# sort the scores and return the top Beam
# number of keys, pruning all keys less than
# Dull times the max score.
function chop(score0,memo,out,   score,n,i) {
    n=asort(score0,score)
    for(i=n;i>=1;i--) {
        if (score[i] <= score[n]*Dull)
            break;
        if (i <= n - Beam)
            break
        out[memo[score[i]]] = score[i]
    }
}

# In subsequent rounds one, score all the candidates
# by running that combination over the data (see the "score"
# function. Note the "score" paramter that caches prior
# calcuations of the score. This speeds up the code by
# a factor of four (for large data sets).
function rounds(bin,class,round, max0,lives,which0,rows,data,f,r,u,out,score,   \
                ideas,max,i,sample,which1,s,memo,which2) {
    if (round == 1)
        max=0
    else { # terminate if we have stopped improving
        max = most(which0)    
        lives =  (max > (max0*More)) ? Lives : lives - 1
        if(lives < 0) { # if termination, copy input to out
            for(i in which0)
                out[i] = which0[i]
            return max }
    }
	if (Verbose) {
       print "% bin " bin " class: " class " seed: " Seed " verbose: " Verbose\
              " round: " round " max: " max " lives: " lives 
	}
    normalize(which0)           # make all the counts n= 1..100
    explode(which0,sample)      # copy items n times
    twos(class,r,sample,Samples,which1) # pick items at random from that sample
    for(i in which0)            # add in the last rounds' ideas
        which1[i] = i;
	beliefsInject(class,which1)
    if (Verbose > 1) 
		values(which1,"candidate")
    for(i in which1) {          # score the new picks and the last rounds's ideas
        s = (i in score) ? score[i] : score(bin,class,i,rows,data,f) + rand()*Pinch
        memo[s] = i
        score[i] = s
        }
	if (Verbose > 1) 
		beliefsReview(class,score)
	chop(score,memo,which2)     # prune the dull candidates
    if (Verbose > 1)  
		o(which2,"scores","-n -k 1") 
    return rounds(bin,class,round+1,max,lives,which2,rows,data,f,u,r,out,score)
}
function beliefsInject(class, which,   ideas,i) {
	split(Beliefs,ideas,";")
	for(i in ideas)
		which[class "," ideas[i]] = class "," ideas[i]
}
function beliefsReview(class, score,   ideas,i) {
	split(Beliefs,ideas,";")
	for(i in ideas)
		print "##score " ideas[i] " " score[class "," ideas[i]]
}
## -----------------------------------------
# Randomly pick pairs and combine them. Note that,
# in the following code, the picks come from "sample"
# where an item may be repeated many times (so things
# that are often repeated are more likely to be picked).

# "n" times, pick two things from "sample"
# and store them in "sampled". note htat
# the combined rules all start with the target class
function twos(class,r,sample,n,sampled,  pair) {
    while(n--) {
        pair= two(class,r,sample)
        sampled[pair]=pair
        }
}

# Pick two things at random. Try not
# to pick the same thing twice. Return
# the combination of the two things you pick.
function two(class,r,sample, tries, this, that,new) {
    this = one(sample)
    if(tries == 9) # nine lives
        return this
    that = one(sample)
    if (this == that)
        return two(class,r,sample,tries + 1)
	else {
		new = combine(class,r,this,that)
		return new ? new : two(class,r,sample,tries + 1)
	}
}

# combine two rules. don't repeat any ranges.
# sort them so that all the ranges of the same
# feature fall together. Note that the frst item in
# a rule is the target class. prune those entries
# away (so they don not repeat themselves).
function combine(class,r,this,that,    k,v,r0,n,i,used,tmp,tests,pair,ignore,out) {
    sub(/^[^,]*,/,"",this)   # get rid of initial class symbol
    sub(/^[^,]*,/,"",that)   #   (its just repeated info anyway)
    split(this "," that,tmp,",")
    n=asort(tmp)            # sort the combined attr=val pairs
	                        # (this makes subsequent equality tests easy- 
	                        #  just use "==")
    for(i=1;i<=n;i++)
        if (!used[tmp[i]]) { # never use the same pair twice
            used[tmp[i]] = 1 # remember that we've used this pair.
			split(tmp[i],pair,_) # the string "attr=val" is a pair
			k=pair[1]            #      with key "k"
			v=pair[2]            #      and value "v"
            push(tests,k)
			push(tests,v)
			r0[k]++         # remember how many times we've seen r0
			if( r0[k] >= r[k] ) # if we've seen it too often
				ignore[k] = 1       # then mark it "ignore"d
        }
	for(i=1; i<=tests[0]; i += 2) # build the rule
		if (!ignore[tests[i]])   # skip the ones we are ignoring
			out = concat(out, tests[i] _ tests[i+1], C) 	
	return out ? class "," out : ""
}

## ---------------------------------
## score a rule by finding its matching rows in data.
# test a rule by scoring it 
function test(bin,class,rules,rows,data,f, com) {
	com="sort -t _ -n -k 2 #" rand()
	for(rule in rules)
		if (rule)
			print rule,rules[rule],score(0,class,rule,rows,data,f) | com
	close(com)
}
function score(bin,class,rule,rows,data,f,  
			 goal,row, col,a,b,c,d,triggered,pd,pf,prec,acc,support,s,fits) {
    s=a=b=c=d=Pinch # stop divide by zero errors
    goal=Name[Name[0]]
    for(row=1;row<=rows[bin];row++) {
        triggered = matched(row,bin,data,rule)
        if (data[bin,row,goal] == class) {
            if (triggered) {d++} else {b++}
        } else {
            if (triggered) {c++} else {a++}
        }
    } 
    fits    = c + d
    pd      = d/(b+d)
    pf      = a/(a+c)
    prec    = d/(c+d)
    acc     = (a+d)/(a+b+c+d)
	lift    = (d/(c+d)) / ((b+d)/(a+b+c+d ))
    support = (c+d)/(a+b+c+d)
    s = score1(pd,pf,prec,acc,support,lift,fits)
	return s
}
function score1(pd,pf,prec,acc,support,lift,fits) {
    if (fits <= OverFitted)
       return 0
    if (Eval==1) return acc
    if (Eval==2) return 2 * pd * prec/(pd+prec)
    if (Eval==3) return 2 * pd * pf/(pd+pf)
    if (Eval==4) return support * 2 * pd * pf/(pd+pf)
    if (Eval==5) return support * acc
    if (Eval==6) return support * 2 * pd * prec/(pd+prec)
	if (Eval==7) return lift
	if (Eval==8) return lift*support  # which btw, is recall
    return support * 2 * pd * prec/(pd+prec)
}

# Given "this" of the form "f1_v1,f2_v2,...." see if "row" matches "this".
# Assumes that disjunctions are modeled as  configuous values from the
# same feature (this is gaurenteed by "combine"). Hence, whenever
# we move to a new feature, we need to check that at least of the values
# mentioned with the old feature was found.
function matched(row,bin,data,this,    col,n,goals,pair,f0,f,status) {
    n=split(this,goals,",")
    for(col=2;col<=n;col++) {
        split(goals[col],pair,_)
        f = pair[1]
        status[f] += data[bin,row,f] == pair[2]
        if (f0 && (f != f0) && !status[f0]) 
            return 0
        f0 = f
    }
    return status[f]
}

## --------------------------------------
## interesting utils

# Given an array a[i]=n,
# fill up "out" with "n" number
# of "i". This creates a "sample" of
# things, from which we can pick randomly
# biased by the relative frequencies of "n".
# The total size of the sample is stored
# in "sample[0]"
function explode(a, out,i,j) {
    for(i in a)
        for(j=1;j<=a[i];j++)
            out[++out[0]] = i
}

# Pick any item at random from "sample".
# Assumes that the same size is in array
# element "sample[0]"
function one(sample, any) {
    any = int(rand()* sample[0])+1
    return sample[any]
}

## --------------------------------------
## boring utils

# Given an array a[i]=num, normalize
# all the numbers to integers 0..100
function normalize(a, i,sum) {
    for(i in a) sum += a[i]
    for(i in a) a[i] = int(100*a[i]/sum)
}

# combine an feature/ range
function fv(f,v) { return f _ v }

# find the max item in an array
function most(a,   i,max) {
    max = -1000000000
    for(i in a)
        if (a[i] > max)
            max = a[i];
    return max
}
# push
function push(a,x) {
	return a[++a[0]]=x
}
# string append, with seperator
function concat(s1,s2,sep) {
	return  s1 ? s1 sep s2 : s2
}
# print array values
function values(a,s,what,  i,com) {
	print ""
    com = what ? "sort " what : "sort"
    for(i in a) 
        print "% " s": " a[i] | com;
    close(com)
 }	
# print an array, sorted by "what"
function o(a,s,what,   i,com) {
	print ""
    com = what ? "sort -t, " what : "sort -t, "
    for(i in a) 
        print s"["a[i] ","i"]." | com;
    close(com)
 } 
function oo(a,s,  i){
	print ""
    for(i in a) 
        print s"[" i "] = " a[i]
}