#!/usr/bin/gawk -f ########################################################################## # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . ########################################################################## # which2 : a stochastic anytime rule learner # (c) Tim Menzies (tim@menzies.us) 2010, LGLP 3.0 # This program builds rules by ranking ideas, then repeatedly building new ideas # by picking # and combining two old ideas (favoring those with higher ranks). New # ideas generated in this way are ranked and thrown back into the same pot as # the old ideas so, if they are any good, they might be picked and extended # in subsequent rounds. Alternatively, if the new idea stinks, it gets buried # by the better ideas and is ignore. # One important aspect of the following is that the scoring routines for # ideas are completely seperate from the rest of the code (see the "score1" # function). Hence, it is a simple # matter to try our different search biases. # e.g. This call produces the following output. # gawk -f which2.awk titanic.arff # In the following, the "candidates" are ideas that look promsing # and "score" ranks the candidates. If the max score does not improve # from the last round, then "lives" decreases. # Each round tries random combinations of the stuff from prior rounds # (favoring those things with higher scores). Hence, at round 1, # all the candidates are singletons. But. later on (see line 54) # the candidates can grow to combinations of things. # Each round prunes the candiates so that only the better candiadtes # surive to round+1. # The final output are the candidates of the last round. # ---------------------------------------------------- # % class: yes seed: 1 round: 1 max: 0 lives: 5 # # % candidate: yes,age=child # % candidate: yes,age=child,class=1st # % candidate: yes,age=child,sex=female # % candidate: yes,class=1st # % candidate: yes,class=1st,class=2nd # % candidate: yes,class=1st,sex=female # % candidate: yes,class=2nd # % candidate: yes,class=2nd,sex=female # % candidate: yes,sex=female # # [0.022196665,yes,class=1st,sex=female]. # [0.030848353,yes,class=2nd]. # [0.05809188,yes,class=1st]. # [0.12507705,yes,sex=female]. # [0.13519168,yes,class=1st,class=2nd]. # # ---------------------------------------------------- # % class: yes seed: 1 round: 2 max: 0.13519168 lives: 5 # # % candidate: yes,class=1st # % candidate: yes,class=1st,class=2nd # % candidate: yes,class=1st,class=2nd,sex=female # % candidate: yes,class=1st,sex=female # % candidate: yes,class=2nd # % candidate: yes,class=2nd,sex=female # % candidate: yes,sex=female # # [0.022196665,yes,class=1st,sex=female]. # [0.030848353,yes,class=2nd]. # [0.056260989,yes,class=1st,class=2nd,sex=female]. # [0.05809188,yes,class=1st]. # [0.12507705,yes,sex=female]. # [0.13519168,yes,class=1st,class=2nd]. # # ---------------------------------------------------- # % class: yes seed: 1 round: 3 max: 0.13519168 lives: 4 # # % candidate: yes,class=1st # % candidate: yes,class=1st,class=2nd # % candidate: yes,class=1st,class=2nd,sex=female # % candidate: yes,class=1st,sex=female # % candidate: yes,class=2nd # % candidate: yes,class=2nd,sex=female # % candidate: yes,sex=female # # [0.022196665,yes,class=1st,sex=female]. # [0.030848353,yes,class=2nd]. # [0.056260989,yes,class=1st,class=2nd,sex=female]. # [0.05809188,yes,class=1st]. # [0.12507705,yes,sex=female]. # [0.13519168,yes,class=1st,class=2nd]. # # ---------------------------------------------------- # % class: yes seed: 1 round: 4 max: 0.13519168 lives: 3 # # % candidate: yes,class=1st # % candidate: yes,class=1st,class=2nd # % candidate: yes,class=1st,class=2nd,sex=female # % candidate: yes,class=1st,sex=female # % candidate: yes,class=2nd # % candidate: yes,class=2nd,sex=female # % candidate: yes,sex=female # # [0.022196665,yes,class=1st,sex=female]. # [0.030848353,yes,class=2nd]. # [0.056260989,yes,class=1st,class=2nd,sex=female]. # [0.05809188,yes,class=1st]. # [0.12507705,yes,sex=female]. # [0.13519168,yes,class=1st,class=2nd]. # # ---------------------------------------------------- # % class: yes seed: 1 round: 5 max: 0.13519168 lives: 2 # # % candidate: yes,class=1st # % candidate: yes,class=1st,class=2nd # % candidate: yes,class=1st,class=2nd,sex=female # % candidate: yes,class=1st,sex=female # % candidate: yes,class=2nd # % candidate: yes,class=2nd,sex=female # % candidate: yes,sex=female # # [0.022196665,yes,class=1st,sex=female]. # [0.030848353,yes,class=2nd]. # [0.056260989,yes,class=1st,class=2nd,sex=female]. # [0.05809188,yes,class=1st]. # [0.12507705,yes,sex=female]. # [0.13519168,yes,class=1st,class=2nd]. # # ---------------------------------------------------- # % class: yes seed: 1 round: 6 max: 0.13519168 lives: 1 # # % candidate: yes,class=1st # % candidate: yes,class=1st,class=2nd # % candidate: yes,class=1st,class=2nd,sex=female # % candidate: yes,class=1st,sex=female # % candidate: yes,class=2nd # % candidate: yes,class=2nd,sex=female # % candidate: yes,sex=female # # [0.022196665,yes,class=1st,sex=female]. # [0.030848353,yes,class=2nd]. # [0.056260989,yes,class=1st,class=2nd,sex=female]. # [0.05809188,yes,class=1st]. # [0.12507705,yes,sex=female]. # [0.13519168,yes,class=1st,class=2nd]. # # ---------------------------------------------------- # % class: yes seed: 1 round: 7 max: 0.13519168 lives: 0 # # % candidate: yes,class=1st # % candidate: yes,class=1st,class=2nd # % candidate: yes,class=1st,class=2nd,sex=female # % candidate: yes,class=1st,sex=female # % candidate: yes,class=2nd # % candidate: yes,sex=female # # [0.022196665,yes,class=1st,sex=female]. # [0.030848353,yes,class=2nd]. # [0.056260989,yes,class=1st,class=2nd,sex=female]. # [0.05809188,yes,class=1st]. # [0.12507705,yes,sex=female]. # [0.13519168,yes,class=1st,class=2nd]. BEGIN { Seed=1 # Random number see. More = 1.02; # Improvement means at least a 2% growth #Lives=5; # If no improvement after five rounds, give up Dull=0.1; # Ignore candidates with score < Dull*MaxScore #Beam=15; # Only the top (say) 10 candidates survive to the next round #Samples=50; # Pick this number of pairs of candidates from the last round Pinch = 1/1000; # Add a random number of up to "Pinch" to each score OverFitted=3; # When do we prune a rule that matches on too few instances? CONVFMT="%.8g"; # Increase the string size for array contents so we can see the Pinch IGNORECASE=1; SUBSEP = "="; _ = SUBSEP OFS=","; C="," #Verbose=0 # Verbose = 0 means silence } ## -------------------------------------------------- #Data entry. Pretty routine stuff. /@attribute/ {Name[++Name[0]]=$2; Name[$2] = Name[0]} {gsub(/[ \t]*/,"")} # no blanks {gsub(/%.*/,"")} # no comments /^$/ {next} # no blank likes /@data/ {In=1;FS=","; srand(Seed)} /@/ {next} In {Rows++; train(All,H,Rows,Data,F,$NF)} END { learn(All,H,Rows,Data,F) } function train(all,h,row,d,f,class, what,i) { h[class]++ for(i=1;i<=NF;i++) { if ($i == "?") continue; what = Name[i] d[row,what]=$i all[what,$i]++ if (i != NF) f[class,what,$i]++ } } # Now we can begin. Try learning rules for each hypothesis. function learn(all,h,rows,data,f, class) { for(class in h) learn1(class,all,h,rows,data,f) } # In round0, offer a rough ranking of # the ranges. In subequent rounds, randomly select and combine ranges # from prior randoms. function learn1(class,all,h,rows,data,f, which0,which) { round0(class,all,rows,f,h,which0); # make some initial guess #o(which0,"which0","-n -k 5") rounds(class,1,0,Lives,which0,rows,data,f,which) #exit } # In round one, score by b^2/(b+r); i.e. things more likely in # the target class than otherwise (with some support weighting) function round0(class,all,rows,f,h,which, some,i,j,b,r,s,memo,score) { for(i in all) { some = f[class _ i] r = (all[i] - some)/(rows - h[class]) b = some / h[class] if (b > r) { j = class "," i s = b^2/(b+r) + rand()*Pinch memo[s] = j score[j]= s }} chop(score,memo,which) # prune the dull candidates } # Given some score[key]=number and memo[number]=key, # sort the scores and return the top Beam # number of keys, pruning all keys less than # Dull times the max score. function chop(score0,memo,out, score,n,i) { n=asort(score0,score) for(i=n;i>=1;i--) { if (score[i] <= score[n]*Dull) break; if (i <= n - Beam) break out[memo[score[i]]] = score[i] } } # In subsequent rounds one, score all the candidates # by running that combination over the data (see the "score" # function. Note the "score" paramter that caches prior # calcuations of the score. This speeds up the code by # a factor of four (for large data sets). function rounds(class,round, max0,lives,which0,rows,data,f,out,score, \ max,i,sample,which1,s,memo,which2) { if (round == 1) max=0 else { # terminate if we have stopped improving max = most(which0) lives = (max > (max0*More)) ? Lives : lives - 1 if(lives < 0) { # if termination, copy input to out for(i in which0) out[i] = which0[i] return max } } #print "\n----------------------------------------------------" # print "% class: " class " seed: " Seed \ # " round: " round " max: " max " lives: " lives normalize(which0) # make all the counts n= 1..100 explode(which0,sample) # copy items n times twos(class,sample,Samples,which1) # pick items at random from that sample for(i in which0) # add in the last rounds' ideas which1[i] = i; if (Verbose) values(which1,"candidate") for(i in which1) { # score the new picks and the last rounds's ideas s = (i in score) ? score[i] : score(class,i,rows,data,f) + rand()*Pinch memo[s] = i score[i] = s } print DatasetName "," class "," Lives "," Beam "," Samples "," round "," most(which2), "\"" memo[s] "\"" chop(score,memo,which2) # prune the dull candidates if (Verbose) o(which2,"score","-n -k 1") return rounds(class,round+1,max,lives,which2,rows,data,f,out,score) } ## ----------------------------------------- # Randomly pick pairs and combine them. Note that, # in the following code, the picks come from "sample" # where an item may be repeated many times (so things # that are often repeated are more likely to be picked). # "n" times, pick two things from "sample" # and store them in "sampled". note htat # the combined rules all start with the target class function twos(class,sample,n,sampled, pair) { while(n--) { pair= two(class,sample) sampled[pair]=pair } } # Pick two things at random. Try not # to pick the same thing twice. Return # the combination of the two things you pick. function two(class,sample, tries, this, that) { this = one(sample) if(tries == 9) # nine lives return this that = one(sample) if (this == that) return two(class,sample,tries + 1) else return combine(class,this,that) } # combine two rules. don't repeat any ranges. # sort them so that all the ranges of the same # feature fall together. Note that the frst item in # a rule is the target class. prune those entries # away (so they don not repeat themselves). function combine(class,this,that, n,i,used,tmp,out) { sub(/^[^,]*,/,"",this) sub(/^[^,]*,/,"",that) split(this "," that,tmp,",") n=asort(tmp) out=tmp[1] used[tmp[1]]=1 for(i=1;i<=n;i++) if (!used[tmp[i]]) { out = out "," tmp[i] used[tmp[i]] = 1 } return class "," out } ## --------------------------------- ## score a rule by finding its matching rows in data. function score(class,rule,rows,data,f, goal,row, col,a,b,c,d,triggered,pd,pf,prec,acc,support,s,fits) { a=b=c=d=Pinch # stop divide by zero errors goal=Name[Name[0]] for(row=1;row<=rows;row++) { triggered = matched(row,data,rule) if (data[row,goal] == class) { if (triggered) {d++} else {b++} } else { if (triggered) {c++} else {a++} } } fits = c + d pd = d/(b+d) pf = a/(a+c) prec = d/(c+d) acc = (a+d)/(a+b+c+d) support = (c+d)/(a+b+c+d) return score1(pd,pf,prec,acc,support,fits) } function score1(pd,pf,prec,acc,support,fits) { if (fits <= OverFitted) return 0 if (Eval==1) return acc if (Eval==2) return 2 * pd * prec/(pd+prec) if (Eval==3) return 2 * pd * pf/(pd+pf) if (Eval==4) return support * 2 * pd * pf/(pd+pf) return support * 2 * pd * prec/(pd+prec) } # Given "this" of the form "f1_v1,f2_v2,...." see if "row" matches "this". # Assumes that disjunctions are modeled as configuous values from the # same feature (this is gaurenteed by "combine"). Hence, whenever # we move to a new feature, we need to check that at least of the values # mentioned with the old feature was found. function matched(row,data,this, col,n,goals,pair,f0,f,status) { n=split(this,goals,",") for(col=2;col<=n;col++) { split(goals[col],pair,_) f = pair[1] status[f] += data[row,f] == pair[2] if (f0 && (f != f0) && !status[f0]) return 0 f0 = f } return status[f] } ## -------------------------------------- ## interesting utils # Given an array a[i]=n, # fill up "out" with "n" number # of "i". This creates a "sample" of # things, from which we can pick randomly # biased by the relative frequencies of "n". # The total size of the sample is stored # in "sample[0]" function explode(a, out,i,j) { for(i in a) for(j=1;j<=a[i];j++) out[++out[0]] = i } # Pick any item at random from "sample". # Assumes that the same size is in array # element "sample[0]" function one(sample, any) { any = int(rand()* sample[0])+1 return sample[any] } ## -------------------------------------- ## boring utils # Given an array a[i]=num, normalize # all the numbers to integers 0..100 function normalize(a, i,sum) { for(i in a) sum += a[i] for(i in a) a[i] = int(100*a[i]/sum) } # combine an feature/ range function fv(f,v) { return f _ v } # find the max item in an array function most(a, i,max) { max = -1000000000 for(i in a) if (a[i] > max) max = a[i]; return max } # print array values function values(a,s,what, i,com) { print "" com = what ? "sort " what : "sort" for(i in a) print "% " s": " a[i] | com; close(com) } # print an array, sorted by "what" function o(a,s,what, i,com) { print "" com = what ? "sort -t, " what : "sort -t, " for(i in a) print "["a[i] ","i"]." | com; close(com) }