#!/sw/bin/gawk -f 

##########################################################################
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU Lesser General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU Lesser General Public License for more details.
#
#    You should have received a copy of the GNU Lesser General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
##########################################################################

# whichn2 : a stochastic anytime rule-based optimizer
# (c) Tim Menzies (tim@menzies.us) 2010, LGLP 3.0 

# This program builds rules by ranking ideas, then repeatedly building new ideas
# by picking # and combining two old ideas (favoring those with higher ranks).  New
# ideas generated in this way are ranked and thrown back into the same pot as
# the old ideas so, if they are any good, they might be picked and extended
# in subsequent rounds. Alternatively, if the new idea stinks, it gets buried
# by the better ideas and is ignore.

# One important aspect of the following is that the scoring routines for
# ideas are completely seperate from the rest of the code (see the "score1"
# function). Hence, it is a simple # matter to try our different search biases.

# e.g. This call produces the following output.
# gawk -f which2n.awk titanic.arff 

# In the following, the "candidates" are ideas that look promsing
# and "score" ranks the candidates. If the max score does not improve
# from the last round, then "lives" decreases.

# Each round tries random combinations of the stuff from prior rounds 
# (favoring those things with higher scores). Hence, at round 1,
# all the candidates are singletons. But. later on (see line 54)
# the candidates can grow to combinations of things.

# Each round prunes the candiates so that only the better candiadtes
# surive to round+1.

BEGIN  {
	Seed=1          # Random number see.
    More = 1.02;    # Improvement means at least a 2% growth
    Lives=5;        # If no improvement after five rounds, give up
    Dull=0.1;       # Ignore candidates with score < Dull*MaxScore
    Beam=10;        # Only the top (say) 10 candidates survive to the next round
    Samples=20;     # Pick this number of pairs of candidates from the last round
    Pinch = 1/1000; # Add a random number of up to "Pinch" to each score
	OverFitted=3;   # When do we prune a rule that matches on too few instances?
    CONVFMT="%.8g"; # Increase the string size for array contents so we can see the Pinch
    IGNORECASE=1; 
    _ = SUBSEP;
    C=","
}

## --------------------------------------------------
#Data entry. Pretty routine stuff. 
/@attribute/ {Name[++Name[0]]=$2}
             {gsub(/[ \t]*/,"")} # no blanks 
             {gsub(/#.*/,"")}    # no comments
/^$/         {next}              # no blank likes
/@data/      {In=1; FS=","; srand(Seed)}
/@/          {next}
In           {Rows++; 
              train(Rows,Data,Counts)}

function train(rows,d,c,   i) {
    for(i=1;i<=NF;i++)  {
        if ($i == "?") 
            continue;
         d[rows,i]=$i 
		 c[i,$i] += $NF
	 }
}

# Now we can begin. In round0, offer a rough ranking of
# the ranges. In subequent rounds, randomly select and combine ranges
# from prior randoms.
END   {
	   round0(Counts,Hot0); # make some initial guess
       rounds(1,0,Lives,Hot0,Rows,Data,Out)
      }

# In round one, score by b^/(b+r)
function round0(c,hot,  i,s,memo,score) {
    for(i in c) {
		   s = c[i]
           memo[s] = i
           score[i]= s 
        }
   chop(score,memo,hot) # prune the dull candidates
}

# Given some score[key]=number and memo[number]=key,
# sort the scores and return the top Beam
# number of keys, pruning all keys less than
# Dull times the max score.
function chop(score0,memo,out,   score,n,i) {
    n=asort(score0,score)
    for(i=n;i>=1;i--) {
        if (score[i] <= score[n]*Dull)
            break;
        if (i <= n - Beam)
            break
        out[memo[score[i]]] = score[i]
    }
}

# In subsequent rounds one, score all the candidates
# by running that combination over the data (see the "score"
# function. Note the "score" paramter that caches prior
# calcuations of the score. This speeds up the code by
# a factor of four (for large data sets).
function rounds(round, max0,lives,hot0,rows,data,out,score,   \
                max,i,sample,hot1,s,memo,hot2) {
    if (round == 1)
        max=0
    else { # terminate if we have stopped improving
        max = most(hot0)    
        lives =  (max > (max0*More)) ? Lives : lives - 1
        if(lives < 0) { # if termination, copy input to out
            for(i in hot0)
                out[i] = hot0[i]
            return max }
    }
    print "\n---| round: " round " seed: " Seed " |-----------------------------------"
    print "max   : " max "\nlives : " lives 
    normalize(hot0)           # make all the counts n= 1..100
    explode(hot0,sample)      # copy items n times
    twos(sample,Samples,hot1) # pick items at random from that sample
    for(i in hot0)            # add in the last rounds' ideas
        hot1[i] = i;
    values(hot1,"candidate")
    for(i in hot1) {          # score the new picks and the last rounds's ideas
        s = (i in score) ? score[i] : score(i,rows,data) + rand()*Pinch
        memo[s] = i
        score[i] = s
        }
    chop(score,memo,hot2)     # prune the dull candidates
    o(hot2,"score","-n -k 5")
    return rounds(round+1,max,lives,hot2,rows,data,out,score)
}

## -----------------------------------------
# Randomly pick pairs and combine them. Note that,
# in the following code, the picks come from "sample"
# where an item may be repeated many times (so things
# that are often repeated are more likely to be picked).

# "n" times, pick two things from "sample"
# and store them in "sampled"
function twos(sample,n,sampled,  pair) {
    while(n--) {
        pair= two(sample)
        sampled[pair]=pair
        }
}

# Pick two things at random. Try not
# to pick the same thing twice. Return
# the combination of the two things you pick.
function two(sample, tries, this, that) {
    this = one(sample)
    if(tries == 9) # nine lives
        return this
    that = one(sample)
    if (this == that)
        return two(sample,tries + 1)
     else
        return combine(this,that)
}

# combine two rules. don't repeat any ranges.
# sort them so that all the ranges of the same
# feature fall together.
function combine(this,that,   used,tmp,out) {
    split(this "," that,tmp,",")
    n=asort(tmp)
    out = tmp[1]
    used[tmp[1]]=1
    for(i=2;i<=n;i++)
        if (!used[tmp[i]]) {
            out = out "," tmp[i]
            used[tmp[i]] = 1
            }
    return out
}

## ---------------------------------
## score a rule by finding its matching rows in data.
function score(i,rows,data,   
			 cols,row,triggered,fits,sum,support) {
	fits=sum=Pinch
	cols=Name[0]
    for(row=1;row<=rows;row++) {
        triggered = matched(row,cols,data,i)
        if (triggered) {
			fits++
			sum += data[row,cols]
		}
    } 
    support = fits/rows
    return score1(fits,support,sum)
}
function score1(fits,support,s) {
	if (fits <= OverFitted)
		return 0
    if (Eval==1) return support * s
    return s
}

# Given "this" of the form "f1_v1,f2_v2,...." see if "row" matches "this".
# Assumes that disjunctions are modeled as  configuous values from the
# same feature (this is gaurenteed by "combine"). Hence, whenever
# we move to a new feature, we need to check that at least of the values
# mentioned with the old feature was found.
function matched(row,cols,data,this,    col,n,goals,pair,f0,f,status) {
    n=split(this,goals,",")
    for(col=1;col<=n;col++) {
        split(goals[col],pair,_)
        f = pair[1]
        status[f] += data[row,f] == pair[2]
        if (f0 && (f != f0) && !status[f0]) 
            return 0
        f0 = f
    }
    return status[f]
}

## --------------------------------------
## interesting utils

# Given an array a[i]=n,
# fill up "out" with "n" number
# of "i". This creates a "sample" of
# things, from which we can pick randomly
# biased by the relative frequencies of "n".
# The total size of the sample is stored
# in "sample[0]"
function explode(a, out,i,j) {
    for(i in a)
        for(j=1;j<=a[i];j++)
            out[++out[0]] = i
}

# Pick any item at random from "sample".
# Assumes that the same size is in array
# element "sample[0]"
function one(sample, any) {
    any = int(rand()* sample[0])+1
    return sample[any]
}

## --------------------------------------
## boring utils

# Given an array a[i]=num, normalize
# all the numbers to integers 0..100
function normalize(a, i,sum) {
    for(i in a) sum += a[i]
    for(i in a) a[i] = int(100*a[i]/sum)
}

# combine an feature/ range
function fv(f,v) { return f _ v }

# find the max item in an array
function most(a,   i,max) {
    max = -1000000000
    for(i in a)
        if (a[i] > max)
            max = a[i];
    return max
}

# print array values
function values(a,s,what,  i,j,com) {
	print ""
    what = what ? what : ""
    com = "sort " what
    for(i in a) {
        j=a[i];
        gsub(_,",",j);
        print s": " j | com;
    }
    close(com)
 }	
# print an array, sorted by "what"
function o(a,s,what,   i,j,com) {
   print ""
    what = what ? what : ""
    com = "sort " what
    for(i in a) {
        j=i;
        gsub(_,",",j);
        print s"[ "j" ] = " a[i] | com;
    }
    close(com)
 }