#!/sw/bin/gawk -f
##########################################################################
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see .
##########################################################################
# which2 : a stochastic anytime rule learner
# (c) Tim Menzies (tim@menzies.us) 2010, LGLP 3.0
# This program builds rules by ranking ideas, then repeatedly building new ideas
# by picking # and combining two old ideas (favoring those with higher ranks). New
# ideas generated in this way are ranked and thrown back into the same pot as
# the old ideas so, if they are any good, they might be picked and extended
# in subsequent rounds. Alternatively, if the new idea stinks, it gets buried
# by the better ideas and is ignore.
# One important aspect of the following is that the scoring routines for
# ideas are completely seperate from the rest of the code (see the "score1"
# function). Hence, it is a simple # matter to try our different search biases.
# e.g. This call produces the following output.
# gawk -f which2.awk titanic.arff
# In the following, the "candidates" are ideas that look promsing
# and "score" ranks the candidates. If the max score does not improve
# from the last round, then "lives" decreases.
# Each round tries random combinations of the stuff from prior rounds
# (favoring those things with higher scores). Hence, at round 1,
# all the candidates are singletons. But. later on (see line 54)
# the candidates can grow to combinations of things.
# Each round prunes the candiates so that only the better candiadtes
# surive to round+1.
# The final output are the candidates of the last round (see lines
# 169 to 178). In this example, the best rule is feature3=female.
# To read the following, note that "1 1st" means "feature 1 = 1st".
# 1
# 2 ----------------
# 3 round 1 max 0 lives 5
# 4
# 5 candidate[ 1,1st ] = 11st
# 6 candidate[ 1,1st,1,2nd ] = 11st,12nd
# 7 candidate[ 1,1st,2,child ] = 11st,2child
# 8 candidate[ 1,1st,3,female ] = 11st,3female
# 9 candidate[ 1,2nd ] = 12nd
# 10 candidate[ 1,2nd,3,female ] = 12nd,3female
# 11 candidate[ 2,child ] = 2child
# 12 candidate[ 3,female ] = 3female
# 13
# 14 score[ 1,2nd ] = 0.65497458
# 15 score[ 2,child ] = 0.6797711
# 16 score[ 1,1st,2,child ] = 0.68008855
# 17 score[ 1,1st,1,2nd ] = 0.69219043
# 18 score[ 1,1st ] = 0.71384384
# 19 score[ 1,2nd,3,female ] = 0.71400064
# 20 score[ 1,1st,3,female ] = 0.73924712
# 21 score[ 3,female ] = 0.77617045
# 22
# 23 ----------------
# 24 round 2 max 0.77617045 lives 5
# 25
# 26 candidate[ 1,1st ] = 11st
# 27 candidate[ 1,1st,1,2nd ] = 11st,12nd
# 28 candidate[ 1,1st,1,2nd,2,child ] = 11st,12nd,2child
# 29 candidate[ 1,1st,1,2nd,3,female ] = 11st,12nd,3female
# 30 candidate[ 1,1st,2,child ] = 11st,2child
# 31 candidate[ 1,1st,2,child,3,female ] = 11st,2child,3female
# 32 candidate[ 1,1st,3,female ] = 11st,3female
# 33 candidate[ 1,2nd ] = 12nd
# 34 candidate[ 1,2nd,3,female ] = 12nd,3female
# 35 candidate[ 2,child ] = 2child
# 36 candidate[ 3,female ] = 3female
# 37
# 38 score[ 1,1st,2,child,3,female ] = 0.67778641
# 39 score[ 2,child ] = 0.6797711
# 40 score[ 1,1st,2,child ] = 0.68008855
# 41 score[ 1,1st,1,2nd,2,child ] = 0.69146934
# 42 score[ 1,1st,1,2nd ] = 0.69219043
# 43 score[ 1,1st ] = 0.71384384
# 44 score[ 1,2nd,3,female ] = 0.71400064
# 45 score[ 1,1st,3,female ] = 0.73924712
# 46 score[ 1,1st,1,2nd,3,female ] = 0.77596332
# 47 score[ 3,female ] = 0.77617045
# 48
# 49 ----------------
# 50 round 3 max 0.77617045 lives 4
# 51
# 52 candidate[ 1,1st ] = 11st
# 53 candidate[ 1,1st,1,2nd ] = 11st,12nd
# 54 candidate[ 1,1st,1,2nd,2,child ] = 11st,12nd,2child
# 55 candidate[ 1,1st,1,2nd,2,child,3,female ] = 11st,12nd,2child,3female
# 56 candidate[ 1,1st,1,2nd,3,female ] = 11st,12nd,3female
# 57 candidate[ 1,1st,2,child ] = 11st,2child
# 58 candidate[ 1,1st,2,child,3,female ] = 11st,2child,3female
# 59 candidate[ 1,1st,3,female ] = 11st,3female
# 60 candidate[ 1,2nd,3,female ] = 12nd,3female
# 61 candidate[ 2,child ] = 2child
# 62 candidate[ 2,child,3,female ] = 2child,3female
# 63 candidate[ 3,female ] = 3female
# 64
# 65 score[ 1,1st,2,child ] = 0.68008855
# 66 score[ 2,child,3,female ] = 0.6821201
# 67 score[ 1,1st,1,2nd,2,child,3,female ] = 0.68397115
# 68 score[ 1,1st,1,2nd,2,child ] = 0.69146934
# 69 score[ 1,1st,1,2nd ] = 0.69219043
# 70 score[ 1,1st ] = 0.71384384
# 71 score[ 1,2nd,3,female ] = 0.71400064
# 72 score[ 1,1st,3,female ] = 0.73924712
# 73 score[ 1,1st,1,2nd,3,female ] = 0.77596332
# 74 score[ 3,female ] = 0.77617045
# 75
# 76 ----------------
# 77 round 4 max 0.77617045 lives 3
# 78
# 79 candidate[ 1,1st ] = 11st
# 80 candidate[ 1,1st,1,2nd ] = 11st,12nd
# 81 candidate[ 1,1st,1,2nd,2,child ] = 11st,12nd,2child
# 82 candidate[ 1,1st,1,2nd,2,child,3,female ] = 11st,12nd,2child,3female
# 83 candidate[ 1,1st,1,2nd,3,female ] = 11st,12nd,3female
# 84 candidate[ 1,1st,2,child ] = 11st,2child
# 85 candidate[ 1,1st,2,child,3,female ] = 11st,2child,3female
# 86 candidate[ 1,1st,3,female ] = 11st,3female
# 87 candidate[ 1,2nd,3,female ] = 12nd,3female
# 88 candidate[ 2,child,3,female ] = 2child,3female
# 89 candidate[ 3,female ] = 3female
# 90
# 91 score[ 1,1st,2,child ] = 0.68008855
# 92 score[ 2,child,3,female ] = 0.6821201
# 93 score[ 1,1st,1,2nd,2,child,3,female ] = 0.68397115
# 94 score[ 1,1st,1,2nd,2,child ] = 0.69146934
# 95 score[ 1,1st,1,2nd ] = 0.69219043
# 96 score[ 1,1st ] = 0.71384384
# 97 score[ 1,2nd,3,female ] = 0.71400064
# 98 score[ 1,1st,3,female ] = 0.73924712
# 99 score[ 1,1st,1,2nd,3,female ] = 0.77596332
# 100 score[ 3,female ] = 0.77617045
# 101
# 102 ----------------
# 103 round 5 max 0.77617045 lives 2
# 104
# 105 candidate[ 1,1st ] = 11st
# 106 candidate[ 1,1st,1,2nd ] = 11st,12nd
# 107 candidate[ 1,1st,1,2nd,2,child ] = 11st,12nd,2child
# 108 candidate[ 1,1st,1,2nd,2,child,3,female ] = 11st,12nd,2child,3female
# 109 candidate[ 1,1st,1,2nd,3,female ] = 11st,12nd,3female
# 110 candidate[ 1,1st,2,child ] = 11st,2child
# 111 candidate[ 1,1st,2,child,3,female ] = 11st,2child,3female
# 112 candidate[ 1,1st,3,female ] = 11st,3female
# 113 candidate[ 1,2nd,3,female ] = 12nd,3female
# 114 candidate[ 2,child,3,female ] = 2child,3female
# 115 candidate[ 3,female ] = 3female
# 116
# 117 score[ 1,1st,2,child ] = 0.68008855
# 118 score[ 2,child,3,female ] = 0.6821201
# 119 score[ 1,1st,1,2nd,2,child,3,female ] = 0.68397115
# 120 score[ 1,1st,1,2nd,2,child ] = 0.69146934
# 121 score[ 1,1st,1,2nd ] = 0.69219043
# 122 score[ 1,1st ] = 0.71384384
# 123 score[ 1,2nd,3,female ] = 0.71400064
# 124 score[ 1,1st,3,female ] = 0.73924712
# 125 score[ 1,1st,1,2nd,3,female ] = 0.77596332
# 126 score[ 3,female ] = 0.77617045
# 127
# 128 ----------------
# 129 round 6 max 0.77617045 lives 1
# 130
# 131 candidate[ 1,1st ] = 11st
# 132 candidate[ 1,1st,1,2nd ] = 11st,12nd
# 133 candidate[ 1,1st,1,2nd,2,child ] = 11st,12nd,2child
# 134 candidate[ 1,1st,1,2nd,2,child,3,female ] = 11st,12nd,2child,3female
# 135 candidate[ 1,1st,1,2nd,3,female ] = 11st,12nd,3female
# 136 candidate[ 1,1st,2,child ] = 11st,2child
# 137 candidate[ 1,1st,2,child,3,female ] = 11st,2child,3female
# 138 candidate[ 1,1st,3,female ] = 11st,3female
# 139 candidate[ 1,2nd,3,female ] = 12nd,3female
# 140 candidate[ 2,child,3,female ] = 2child,3female
# 141 candidate[ 3,female ] = 3female
# 142
# 143 score[ 1,1st,2,child ] = 0.68008855
# 144 score[ 2,child,3,female ] = 0.6821201
# 145 score[ 1,1st,1,2nd,2,child,3,female ] = 0.68397115
# 146 score[ 1,1st,1,2nd,2,child ] = 0.69146934
# 147 score[ 1,1st,1,2nd ] = 0.69219043
# 148 score[ 1,1st ] = 0.71384384
# 149 score[ 1,2nd,3,female ] = 0.71400064
# 150 score[ 1,1st,3,female ] = 0.73924712
# 151 score[ 1,1st,1,2nd,3,female ] = 0.77596332
# 152 score[ 3,female ] = 0.77617045
# 153
# 154 ----------------
# 155 round 7 max 0.77617045 lives 0
# 156
# 157 candidate[ 1,1st ] = 11st
# 158 candidate[ 1,1st,1,2nd ] = 11st,12nd
# 159 candidate[ 1,1st,1,2nd,2,child ] = 11st,12nd,2child
# 160 candidate[ 1,1st,1,2nd,2,child,3,female ] = 11st,12nd,2child,3female
# 161 candidate[ 1,1st,1,2nd,3,female ] = 11st,12nd,3female
# 162 candidate[ 1,1st,2,child ] = 11st,2child
# 163 candidate[ 1,1st,2,child,3,female ] = 11st,2child,3female
# 164 candidate[ 1,1st,3,female ] = 11st,3female
# 165 candidate[ 1,2nd,3,female ] = 12nd,3female
# 166 candidate[ 2,child,3,female ] = 2child,3female
# 167 candidate[ 3,female ] = 3female
# 168
# 169 score[ 1,1st,2,child ] = 0.68008855
# 170 score[ 2,child,3,female ] = 0.6821201
# 171 score[ 1,1st,1,2nd,2,child,3,female ] = 0.68397115
# 172 score[ 1,1st,1,2nd,2,child ] = 0.69146934
# 173 score[ 1,1st,1,2nd ] = 0.69219043
# 174 score[ 1,1st ] = 0.71384384
# 175 score[ 1,2nd,3,female ] = 0.71400064
# 176 score[ 1,1st,3,female ] = 0.73924712
# 177 score[ 1,1st,1,2nd,3,female ] = 0.77596332
# 178 score[ 3,female ] = 0.77617045
BEGIN {
Goal="yes"; # Best = the Goal class. Rest = anything else
Seed=1 # Random number see.
More = 1.02; # Improvement means at least a 2% growth
Lives=5; # If no improvement after five rounds, give up
Dull=0.1; # Ignore candidates with score < Dull*MaxScore
Beam=10; # Only the top (say) 10 candidates survive to the next round
Samples=20; # Pick this number of pairs of candidates from the last round
Pinch = 1/1000; # Add a random number of up to "Pinch" to each score
OverFitted=3; # When do we prune a rule that matches on too few instances?
CONVFMT="%.8g"; # Increase the string size for array contents so we can see the Pinch
IGNORECASE=1;
_ = SUBSEP;
C=","
}
## --------------------------------------------------
#Data entry. Pretty routine stuff.
/@attribute/ {Name[++Name[0]]=$2}
{gsub(/[ \t]*/,"")} # no blanks
{gsub(/#.*/,"")} # no comments
/^$/ {next} # no blank likes
/@data/ {In=1;FS=","; srand(Seed)}
/@/ {next}
In {Rows++;
train(All,Best,H,Rows,Data,$NF==Goal)}
function train(all,best,h,rows,d,class, i) {
h[class]++
for(i=1;i<=NF;i++) {
if ($i == "?")
continue;
if (i == NF) {
d[rows,i]=class
} else {
d[rows,i]=$i
all[i,$i]++
if (class)
best[i,$i]++ }}
}
# Now we can begin. In round0, offer a rough ranking of
# the ranges. In subequent rounds, randomly select and combine ranges
# from prior randoms.
END {round0(All,Best,H,Hot0); # make some initial guess
rounds(1,0,Lives,Hot0,Rows,Data,Out)
}
# In round one, score by b^/(b+r)
function round0(all,best,h,hot, i,b,r,memo,score) {
for(i in best) {
r = (all[i] - best[i])/h[0]
b = best[i]/h[1]
if (b > r) {
s = b^2/(b+r) + rand()*Pinch
memo[s] = i
score[i]= s
}}
chop(score,memo,hot) # prune the dull candidates
}
# Given some score[key]=number and memo[number]=key,
# sort the scores and return the top Beam
# number of keys, pruning all keys less than
# Dull times the max score.
function chop(score0,memo,out, score,n,i) {
n=asort(score0,score)
for(i=n;i>=1;i--) {
if (score[i] <= score[n]*Dull)
break;
if (i <= n - Beam)
break
out[memo[score[i]]] = score[i]
}
}
# In subsequent rounds one, score all the candidates
# by running that combination over the data (see the "score"
# function. Note the "score" paramter that caches prior
# calcuations of the score. This speeds up the code by
# a factor of four (for large data sets).
function rounds(round, max0,lives,hot0,rows,data,out,score, \
max,i,sample,hot1,s,memo,hot2) {
if (round == 1)
max=0
else { # terminate if we have stopped improving
max = most(hot0)
lives = (max > (max0*More)) ? Lives : lives - 1
if(lives < 0) { # if termination, copy input to out
for(i in hot0)
out[i] = hot0[i]
return max }
}
print "\n---| round: " round " seed: " Seed " |-----------------------------------"
print "max : " max "\nlives : " lives
normalize(hot0) # make all the counts n= 1..100
explode(hot0,sample) # copy items n times
twos(sample,Samples,hot1) # pick items at random from that sample
for(i in hot0) # add in the last rounds' ideas
hot1[i] = i;
values(hot1,"candidate")
for(i in hot1) { # score the new picks and the last rounds's ideas
s = (i in score) ? score[i] : score(i,rows,data) + rand()*Pinch
memo[s] = i
score[i] = s
}
chop(score,memo,hot2) # prune the dull candidates
o(hot2,"score","-n -k 5")
return rounds(round+1,max,lives,hot2,rows,data,out,score)
}
## -----------------------------------------
# Randomly pick pairs and combine them. Note that,
# in the following code, the picks come from "sample"
# where an item may be repeated many times (so things
# that are often repeated are more likely to be picked).
# "n" times, pick two things from "sample"
# and store them in "sampled"
function twos(sample,n,sampled, pair) {
while(n--) {
pair= two(sample)
sampled[pair]=pair
}
}
# Pick two things at random. Try not
# to pick the same thing twice. Return
# the combination of the two things you pick.
function two(sample, tries, this, that) {
this = one(sample)
if(tries == 9) # nine lives
return this
that = one(sample)
if (this == that)
return two(sample,tries + 1)
else
return combine(this,that)
}
# combine two rules. don't repeat any ranges.
# sort them so that all the ranges of the same
# feature fall together.
function combine(this,that, used,tmp,out) {
split(this "," that,tmp,",")
n=asort(tmp)
out = tmp[1]
used[tmp[1]]=1
for(i=2;i<=n;i++)
if (!used[tmp[i]]) {
out = out "," tmp[i]
used[tmp[i]] = 1
}
return out
}
## ---------------------------------
## score a rule by finding its matching rows in data.
function score(i,rows,data,
cols,row, col,a,b,c,d,triggered,pd,pf,prec,acc,supports,s,fits) {
a=b=c=d=Pinch # stop divide by zero errors
cols=Name[0]
for(row=1;row<=rows;row++) {
triggered = matched(row,cols,data,i)
if (data[row,cols]) {
if (triggered) {d++} else {b++}
} else {
if (triggered) {c++} else {a++}
}
}
fits = c + d
pd = d/(b+d)
pf = a/(a+c)
prec = d/(c+d)
acc = (a+d)/(a+b+c+d)
support = (c+d)/(a+b+c+d)
return score1(pd,pf,prec,acc,support,fits)
}
function score1(pd,pf,prec,acc,supportm,fits) {
if (fits <= OverFitted)
return 0
if (Eval==1) return acc
if (Eval==2) return 2 * pd * prec/(pd+prec)
if (Eval==3) return 2 * pd * pf/(pd+pf)
if (Eval==4) return support * 2 * pd * pf/(pd+pf)
return support * 2 * pd * prec/(pd+prec)
}
# Given "this" of the form "f1_v1,f2_v2,...." see if "row" matches "this".
# Assumes that disjunctions are modeled as configuous values from the
# same feature (this is gaurenteed by "combine"). Hence, whenever
# we move to a new feature, we need to check that at least of the values
# mentioned with the old feature was found.
function matched(row,cols,data,this, col,n,goals,pair,f0,f,status) {
n=split(this,goals,",")
for(col=1;col<=n;col++) {
split(goals[col],pair,_)
f = pair[1]
status[f] += data[row,f] == pair[2]
if (f0 && (f != f0) && !status[f0])
return 0
f0 = f
}
return status[f]
}
## --------------------------------------
## interesting utils
# Given an array a[i]=n,
# fill up "out" with "n" number
# of "i". This creates a "sample" of
# things, from which we can pick randomly
# biased by the relative frequencies of "n".
# The total size of the sample is stored
# in "sample[0]"
function explode(a, out,i,j) {
for(i in a)
for(j=1;j<=a[i];j++)
out[++out[0]] = i
}
# Pick any item at random from "sample".
# Assumes that the same size is in array
# element "sample[0]"
function one(sample, any) {
any = int(rand()* sample[0])+1
return sample[any]
}
## --------------------------------------
## boring utils
# Given an array a[i]=num, normalize
# all the numbers to integers 0..100
function normalize(a, i,sum) {
for(i in a) sum += a[i]
for(i in a) a[i] = int(100*a[i]/sum)
}
# combine an feature/ range
function fv(f,v) { return f _ v }
# find the max item in an array
function most(a, i,max) {
max = -1000000000
for(i in a)
if (a[i] > max)
max = a[i];
return max
}
# print array values
function values(a,s,what, i,j,com) {
print ""
what = what ? what : ""
com = "sort " what
for(i in a) {
j=a[i];
gsub(_,",",j);
print s": " j | com;
}
close(com)
}
# print an array, sorted by "what"
function o(a,s,what, i,j,com) {
print ""
what = what ? what : ""
com = "sort " what
for(i in a) {
j=i;
gsub(_,",",j);
print s"[ "j" ] = " a[i] | com;
}
close(com)
}