#!/usr/bin/gawk -f 

##########################################################################
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU Lesser General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU Lesser General Public License for more details.
#
#    You should have received a copy of the GNU Lesser General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
##########################################################################

# which2 : a stochastic anytime rule learner 
# (c) Tim Menzies (tim@menzies.us) 2010, LGLP 3.0 

# This program builds rules by ranking ideas, then repeatedly building new ideas
# by picking # and combining two old ideas (favoring those with higher ranks).  New
# ideas generated in this way are ranked and thrown back into the same pot as
# the old ideas so, if they are any good, they might be picked and extended
# in subsequent rounds. Alternatively, if the new idea stinks, it gets buried
# by the better ideas and is ignore.

# One important aspect of the following is that the scoring routines for
# ideas are completely seperate from the rest of the code (see the "score1"
# function). Hence, it is a simple # matter to try our different search biases.

# e.g. This call produces the following output.
# gawk -f which2.awk titanic.arff 

# In the following, the "candidates" are ideas that look promsing
# and "score" ranks the candidates. If the max score does not improve
# from the last round, then "lives" decreases.

# Each round tries random combinations of the stuff from prior rounds 
# (favoring those things with higher scores). Hence, at round 1,
# all the candidates are singletons. But. later on (see line 54)
# the candidates can grow to combinations of things.

# Each round prunes the candiates so that only the better candiadtes
# surive to round+1.

# The final output are the candidates of the last round.

# ----------------------------------------------------
# % class: yes seed: 1 round: 1 max: 0 lives: 5
# 
# % candidate: yes,age=child
# % candidate: yes,age=child,class=1st
# % candidate: yes,age=child,sex=female
# % candidate: yes,class=1st
# % candidate: yes,class=1st,class=2nd
# % candidate: yes,class=1st,sex=female
# % candidate: yes,class=2nd
# % candidate: yes,class=2nd,sex=female
# % candidate: yes,sex=female
# 
# [0.022196665,yes,class=1st,sex=female].
# [0.030848353,yes,class=2nd].
# [0.05809188,yes,class=1st].
# [0.12507705,yes,sex=female].
# [0.13519168,yes,class=1st,class=2nd].
# 
# ----------------------------------------------------
# % class: yes seed: 1 round: 2 max: 0.13519168 lives: 5
# 
# % candidate: yes,class=1st
# % candidate: yes,class=1st,class=2nd
# % candidate: yes,class=1st,class=2nd,sex=female
# % candidate: yes,class=1st,sex=female
# % candidate: yes,class=2nd
# % candidate: yes,class=2nd,sex=female
# % candidate: yes,sex=female
# 
# [0.022196665,yes,class=1st,sex=female].
# [0.030848353,yes,class=2nd].
# [0.056260989,yes,class=1st,class=2nd,sex=female].
# [0.05809188,yes,class=1st].
# [0.12507705,yes,sex=female].
# [0.13519168,yes,class=1st,class=2nd].
# 
# ----------------------------------------------------
# % class: yes seed: 1 round: 3 max: 0.13519168 lives: 4
# 
# % candidate: yes,class=1st
# % candidate: yes,class=1st,class=2nd
# % candidate: yes,class=1st,class=2nd,sex=female
# % candidate: yes,class=1st,sex=female
# % candidate: yes,class=2nd
# % candidate: yes,class=2nd,sex=female
# % candidate: yes,sex=female
# 
# [0.022196665,yes,class=1st,sex=female].
# [0.030848353,yes,class=2nd].
# [0.056260989,yes,class=1st,class=2nd,sex=female].
# [0.05809188,yes,class=1st].
# [0.12507705,yes,sex=female].
# [0.13519168,yes,class=1st,class=2nd].
# 
# ----------------------------------------------------
# % class: yes seed: 1 round: 4 max: 0.13519168 lives: 3
# 
# % candidate: yes,class=1st
# % candidate: yes,class=1st,class=2nd
# % candidate: yes,class=1st,class=2nd,sex=female
# % candidate: yes,class=1st,sex=female
# % candidate: yes,class=2nd
# % candidate: yes,class=2nd,sex=female
# % candidate: yes,sex=female
# 
# [0.022196665,yes,class=1st,sex=female].
# [0.030848353,yes,class=2nd].
# [0.056260989,yes,class=1st,class=2nd,sex=female].
# [0.05809188,yes,class=1st].
# [0.12507705,yes,sex=female].
# [0.13519168,yes,class=1st,class=2nd].
# 
# ----------------------------------------------------
# % class: yes seed: 1 round: 5 max: 0.13519168 lives: 2
# 
# % candidate: yes,class=1st
# % candidate: yes,class=1st,class=2nd
# % candidate: yes,class=1st,class=2nd,sex=female
# % candidate: yes,class=1st,sex=female
# % candidate: yes,class=2nd
# % candidate: yes,class=2nd,sex=female
# % candidate: yes,sex=female
# 
# [0.022196665,yes,class=1st,sex=female].
# [0.030848353,yes,class=2nd].
# [0.056260989,yes,class=1st,class=2nd,sex=female].
# [0.05809188,yes,class=1st].
# [0.12507705,yes,sex=female].
# [0.13519168,yes,class=1st,class=2nd].
# 
# ----------------------------------------------------
# % class: yes seed: 1 round: 6 max: 0.13519168 lives: 1
# 
# % candidate: yes,class=1st
# % candidate: yes,class=1st,class=2nd
# % candidate: yes,class=1st,class=2nd,sex=female
# % candidate: yes,class=1st,sex=female
# % candidate: yes,class=2nd
# % candidate: yes,class=2nd,sex=female
# % candidate: yes,sex=female
# 
# [0.022196665,yes,class=1st,sex=female].
# [0.030848353,yes,class=2nd].
# [0.056260989,yes,class=1st,class=2nd,sex=female].
# [0.05809188,yes,class=1st].
# [0.12507705,yes,sex=female].
# [0.13519168,yes,class=1st,class=2nd].
# 
# ----------------------------------------------------
# % class: yes seed: 1 round: 7 max: 0.13519168 lives: 0
# 
# % candidate: yes,class=1st
# % candidate: yes,class=1st,class=2nd
# % candidate: yes,class=1st,class=2nd,sex=female
# % candidate: yes,class=1st,sex=female
# % candidate: yes,class=2nd
# % candidate: yes,sex=female
# 
# [0.022196665,yes,class=1st,sex=female].
# [0.030848353,yes,class=2nd].
# [0.056260989,yes,class=1st,class=2nd,sex=female].
# [0.05809188,yes,class=1st].
# [0.12507705,yes,sex=female].
# [0.13519168,yes,class=1st,class=2nd].

BEGIN  {
	Seed=1          # Random number see.
    More = 1.02;    # Improvement means at least a 2% growth
    #Lives=5;        # If no improvement after five rounds, give up
    Dull=0.1;       # Ignore candidates with score < Dull*MaxScore
    #Beam=15;        # Only the top (say) 10 candidates survive to the next round
    #Samples=50;     # Pick this number of pairs of candidates from the last round
    Pinch = 1/1000; # Add a random number of up to "Pinch" to each score
	OverFitted=3;   # When do we prune a rule that matches on too few instances?
    CONVFMT="%.8g"; # Increase the string size for array contents so we can see the Pinch
    IGNORECASE=1; 
    SUBSEP = "=";
    _ = SUBSEP
	OFS=",";
    C=","
    #Verbose=0       # Verbose = 0 means silence
}

## --------------------------------------------------
#Data entry. Pretty routine stuff. 
/@attribute/ {Name[++Name[0]]=$2; Name[$2] = Name[0]}
             {gsub(/[ \t]*/,"")} # no blanks 
             {gsub(/%.*/,"")}    # no comments
/^$/         {next}              # no blank likes
/@data/      {In=1;FS=","; srand(Seed)}
/@/          {next}
In           {Rows++; 
              train(All,H,Rows,Data,F,$NF)}
END          { learn(All,H,Rows,Data,F) }

function train(all,h,row,d,f,class,   what,i) {
    h[class]++
    for(i=1;i<=NF;i++)  {
        if ($i == "?") 
            continue;
        what = Name[i]
        d[row,what]=$i
        all[what,$i]++
        if (i != NF) 
            f[class,what,$i]++ }
}

# Now we can begin. Try learning rules for each hypothesis.
function learn(all,h,rows,data,f,   class) {
    for(class in h) 
        learn1(class,all,h,rows,data,f)
}
# In round0, offer a rough ranking of
# the ranges. In subequent rounds, randomly select and combine ranges
# from prior randoms.
function learn1(class,all,h,rows,data,f,   which0,which) {
    round0(class,all,rows,f,h,which0); # make some initial guess
    #o(which0,"which0","-n -k 5")
    rounds(class,1,0,Lives,which0,rows,data,f,which)
	#exit
}

# In round one, score by b^2/(b+r); i.e. things more likely in
# the target class than otherwise (with some support weighting)
function round0(class,all,rows,f,h,which,  some,i,j,b,r,s,memo,score) {
    for(i in all) {
        some = f[class _ i]
        r    = (all[i] - some)/(rows  - h[class])
        b    = some / h[class]
        if (b > r) {
            j = class "," i
            s = b^2/(b+r) + rand()*Pinch
            memo[s] = j
            score[j]= s 
        }}
   chop(score,memo,which) # prune the dull candidates
}

# Given some score[key]=number and memo[number]=key,
# sort the scores and return the top Beam
# number of keys, pruning all keys less than
# Dull times the max score.
function chop(score0,memo,out,   score,n,i) {
    n=asort(score0,score)
    for(i=n;i>=1;i--) {
        if (score[i] <= score[n]*Dull)
            break;
        if (i <= n - Beam)
            break
        out[memo[score[i]]] = score[i]
    }
}

# In subsequent rounds one, score all the candidates
# by running that combination over the data (see the "score"
# function. Note the "score" paramter that caches prior
# calcuations of the score. This speeds up the code by
# a factor of four (for large data sets).
function rounds(class,round, max0,lives,which0,rows,data,f,out,score,   \
                max,i,sample,which1,s,memo,which2) {
    if (round == 1)
        max=0
    else { # terminate if we have stopped improving
        max = most(which0)    
        lives =  (max > (max0*More)) ? Lives : lives - 1
        if(lives < 0) { # if termination, copy input to out
            for(i in which0)
                out[i] = which0[i]
            return max }
    }
	#print "\n----------------------------------------------------"
#    print "% class: "  class " seed: " Seed \
#          " round: " round " max: " max " lives: " lives 
    normalize(which0)           # make all the counts n= 1..100
    explode(which0,sample)      # copy items n times
    twos(class,sample,Samples,which1) # pick items at random from that sample
    for(i in which0)            # add in the last rounds' ideas
        which1[i] = i;
    if (Verbose) values(which1,"candidate")
    for(i in which1) {          # score the new picks and the last rounds's ideas
        s = (i in score) ? score[i] : score(class,i,rows,data,f) + rand()*Pinch
        memo[s] = i
        score[i] = s
        }
    print DatasetName "," class "," Lives "," Beam "," Samples "," round "," most(which2), "\"" memo[s] "\""
    chop(score,memo,which2)     # prune the dull candidates
    if (Verbose) o(which2,"score","-n -k 1")
    return rounds(class,round+1,max,lives,which2,rows,data,f,out,score)
}

## -----------------------------------------
# Randomly pick pairs and combine them. Note that,
# in the following code, the picks come from "sample"
# where an item may be repeated many times (so things
# that are often repeated are more likely to be picked).

# "n" times, pick two things from "sample"
# and store them in "sampled". note htat
# the combined rules all start with the target class
function twos(class,sample,n,sampled,  pair) {
    while(n--) {
        pair= two(class,sample)
        sampled[pair]=pair
        }
}

# Pick two things at random. Try not
# to pick the same thing twice. Return
# the combination of the two things you pick.
function two(class,sample, tries, this, that) {
    this = one(sample)
    if(tries == 9) # nine lives
        return this
    that = one(sample)
    if (this == that)
        return two(class,sample,tries + 1)
     else
        return combine(class,this,that)
}

# combine two rules. don't repeat any ranges.
# sort them so that all the ranges of the same
# feature fall together. Note that the frst item in
# a rule is the target class. prune those entries
# away (so they don not repeat themselves).
function combine(class,this,that,    n,i,used,tmp,out) {
    sub(/^[^,]*,/,"",this)
    sub(/^[^,]*,/,"",that)
    split(this "," that,tmp,",")
    n=asort(tmp)
    out=tmp[1]
    used[tmp[1]]=1
    for(i=1;i<=n;i++)
        if (!used[tmp[i]]) {
            out = out "," tmp[i]
            used[tmp[i]] = 1
        }
    return class "," out
}

## ---------------------------------
## score a rule by finding its matching rows in data.
function score(class,rule,rows,data,f,  
			 goal,row, col,a,b,c,d,triggered,pd,pf,prec,acc,support,s,fits) {
    a=b=c=d=Pinch # stop divide by zero errors
    goal=Name[Name[0]]
    for(row=1;row<=rows;row++) {
        triggered = matched(row,data,rule)
        if (data[row,goal] == class) {
            if (triggered) {d++} else {b++}
        } else {
            if (triggered) {c++} else {a++}
        }
    } 
    fits    = c + d
    pd      = d/(b+d)
    pf      = a/(a+c)
    prec    = d/(c+d)
    acc     = (a+d)/(a+b+c+d)
    support = (c+d)/(a+b+c+d)
    return score1(pd,pf,prec,acc,support,fits)
}
function score1(pd,pf,prec,acc,support,fits) {
    if (fits <= OverFitted)
       return 0

    if (Eval==1) return acc
    if (Eval==2) return 2 * pd * prec/(pd+prec)
    if (Eval==3) return 2 * pd * pf/(pd+pf)
    if (Eval==4) return support * 2 * pd * pf/(pd+pf)
    return support * 2 * pd * prec/(pd+prec)
}

# Given "this" of the form "f1_v1,f2_v2,...." see if "row" matches "this".
# Assumes that disjunctions are modeled as  configuous values from the
# same feature (this is gaurenteed by "combine"). Hence, whenever
# we move to a new feature, we need to check that at least of the values
# mentioned with the old feature was found.
function matched(row,data,this,    col,n,goals,pair,f0,f,status) {
    n=split(this,goals,",")
    for(col=2;col<=n;col++) {
        split(goals[col],pair,_)
        f = pair[1]
        status[f] += data[row,f] == pair[2]
        if (f0 && (f != f0) && !status[f0]) 
            return 0
        f0 = f
    }
    return status[f]
}

## --------------------------------------
## interesting utils

# Given an array a[i]=n,
# fill up "out" with "n" number
# of "i". This creates a "sample" of
# things, from which we can pick randomly
# biased by the relative frequencies of "n".
# The total size of the sample is stored
# in "sample[0]"
function explode(a, out,i,j) {
    for(i in a)
        for(j=1;j<=a[i];j++)
            out[++out[0]] = i
}

# Pick any item at random from "sample".
# Assumes that the same size is in array
# element "sample[0]"
function one(sample, any) {
    any = int(rand()* sample[0])+1
    return sample[any]
}

## --------------------------------------
## boring utils

# Given an array a[i]=num, normalize
# all the numbers to integers 0..100
function normalize(a, i,sum) {
    for(i in a) sum += a[i]
    for(i in a) a[i] = int(100*a[i]/sum)
}

# combine an feature/ range
function fv(f,v) { return f _ v }

# find the max item in an array
function most(a,   i,max) {
    max = -1000000000
    for(i in a)
        if (a[i] > max)
            max = a[i];
    return max
}

# print array values
function values(a,s,what,  i,com) {
	print ""
    com = what ? "sort " what : "sort"
    for(i in a) 
        print "% " s": " a[i] | com;
    close(com)
 }	
# print an array, sorted by "what"
function o(a,s,what,   i,com) {
	print ""
    com = what ? "sort -t, " what : "sort -t, "
    for(i in a) 
        print "["a[i] ","i"]." | com;
    close(com)
 }