#!/sw/bin/gawk -f ########################################################################## # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . ########################################################################## # which2 : a stochastic anytime rule learner # (c) Tim Menzies (tim@menzies.us) 2010, LGLP 3.0 # This program builds rules by ranking ideas, then repeatedly building new ideas # by picking # and combining two old ideas (favoring those with higher ranks). New # ideas generated in this way are ranked and thrown back into the same pot as # the old ideas so, if they are any good, they might be picked and extended # in subsequent rounds. Alternatively, if the new idea stinks, it gets buried # by the better ideas and is ignore. # One important aspect of the following is that the scoring routines for # ideas are completely seperate from the rest of the code (see the "score1" # function). Hence, it is a simple # matter to try our different search biases. # e.g. This call produces the following output. # gawk -f which2.awk titanic.arff # In the following, the "candidates" are ideas that look promsing # and "score" ranks the candidates. If the max score does not improve # from the last round, then "lives" decreases. # Each round tries random combinations of the stuff from prior rounds # (favoring those things with higher scores). Hence, at round 1, # all the candidates are singletons. But. later on (see line 54) # the candidates can grow to combinations of things. # Each round prunes the candiates so that only the better candiadtes # surive to round+1. # The final output are the candidates of the last round. # ---------------------------------------------------- # % class: yes seed: 1 round: 1 max: 0 lives: 5 # # % candidate: yes,age=child # % candidate: yes,age=child,class=1st # % candidate: yes,age=child,sex=female # % candidate: yes,class=1st # % candidate: yes,class=1st,class=2nd # % candidate: yes,class=1st,sex=female # % candidate: yes,class=2nd # % candidate: yes,class=2nd,sex=female # % candidate: yes,sex=female # # [0.022196665,yes,class=1st,sex=female]. # [0.030848353,yes,class=2nd]. # [0.05809188,yes,class=1st]. # [0.12507705,yes,sex=female]. # [0.13519168,yes,class=1st,class=2nd]. # # ---------------------------------------------------- # % class: yes seed: 1 round: 2 max: 0.13519168 lives: 5 # # % candidate: yes,class=1st # % candidate: yes,class=1st,class=2nd # % candidate: yes,class=1st,class=2nd,sex=female # % candidate: yes,class=1st,sex=female # % candidate: yes,class=2nd # % candidate: yes,class=2nd,sex=female # % candidate: yes,sex=female # # [0.022196665,yes,class=1st,sex=female]. # [0.030848353,yes,class=2nd]. # [0.056260989,yes,class=1st,class=2nd,sex=female]. # [0.05809188,yes,class=1st]. # [0.12507705,yes,sex=female]. # [0.13519168,yes,class=1st,class=2nd]. # # ---------------------------------------------------- # % class: yes seed: 1 round: 3 max: 0.13519168 lives: 4 # # % candidate: yes,class=1st # % candidate: yes,class=1st,class=2nd # % candidate: yes,class=1st,class=2nd,sex=female # % candidate: yes,class=1st,sex=female # % candidate: yes,class=2nd # % candidate: yes,class=2nd,sex=female # % candidate: yes,sex=female # # [0.022196665,yes,class=1st,sex=female]. # [0.030848353,yes,class=2nd]. # [0.056260989,yes,class=1st,class=2nd,sex=female]. # [0.05809188,yes,class=1st]. # [0.12507705,yes,sex=female]. # [0.13519168,yes,class=1st,class=2nd]. # # ---------------------------------------------------- # % class: yes seed: 1 round: 4 max: 0.13519168 lives: 3 # # % candidate: yes,class=1st # % candidate: yes,class=1st,class=2nd # % candidate: yes,class=1st,class=2nd,sex=female # % candidate: yes,class=1st,sex=female # % candidate: yes,class=2nd # % candidate: yes,class=2nd,sex=female # % candidate: yes,sex=female # # [0.022196665,yes,class=1st,sex=female]. # [0.030848353,yes,class=2nd]. # [0.056260989,yes,class=1st,class=2nd,sex=female]. # [0.05809188,yes,class=1st]. # [0.12507705,yes,sex=female]. # [0.13519168,yes,class=1st,class=2nd]. # # ---------------------------------------------------- # % class: yes seed: 1 round: 5 max: 0.13519168 lives: 2 # # % candidate: yes,class=1st # % candidate: yes,class=1st,class=2nd # % candidate: yes,class=1st,class=2nd,sex=female # % candidate: yes,class=1st,sex=female # % candidate: yes,class=2nd # % candidate: yes,class=2nd,sex=female # % candidate: yes,sex=female # # [0.022196665,yes,class=1st,sex=female]. # [0.030848353,yes,class=2nd]. # [0.056260989,yes,class=1st,class=2nd,sex=female]. # [0.05809188,yes,class=1st]. # [0.12507705,yes,sex=female]. # [0.13519168,yes,class=1st,class=2nd]. # # ---------------------------------------------------- # % class: yes seed: 1 round: 6 max: 0.13519168 lives: 1 # # % candidate: yes,class=1st # % candidate: yes,class=1st,class=2nd # % candidate: yes,class=1st,class=2nd,sex=female # % candidate: yes,class=1st,sex=female # % candidate: yes,class=2nd # % candidate: yes,class=2nd,sex=female # % candidate: yes,sex=female # # [0.022196665,yes,class=1st,sex=female]. # [0.030848353,yes,class=2nd]. # [0.056260989,yes,class=1st,class=2nd,sex=female]. # [0.05809188,yes,class=1st]. # [0.12507705,yes,sex=female]. # [0.13519168,yes,class=1st,class=2nd]. # # ---------------------------------------------------- # % class: yes seed: 1 round: 7 max: 0.13519168 lives: 0 # # % candidate: yes,class=1st # % candidate: yes,class=1st,class=2nd # % candidate: yes,class=1st,class=2nd,sex=female # % candidate: yes,class=1st,sex=female # % candidate: yes,class=2nd # % candidate: yes,sex=female # # [0.022196665,yes,class=1st,sex=female]. # [0.030848353,yes,class=2nd]. # [0.056260989,yes,class=1st,class=2nd,sex=female]. # [0.05809188,yes,class=1st]. # [0.12507705,yes,sex=female]. # [0.13519168,yes,class=1st,class=2nd]. BEGIN { Beam=10; # Only the top (say) 10 candidates survive to the next round Dull=0.1; # Ignore candidates with score < Dull*MaxScore Eval1=5; Eval2=1; Inf=10^32; Lives=5; # If no improvement after five rounds, give up Max=1 # If (1 or 0) try to (max or min) the score More = 1.02; # Improvement means at least a 2% growth OverFitted=3; # When do we prune a rule that matches on too few instances? Pinch = 1/1000; # Add a random number of up to "Pinch" to each score Samples=20; # Pick this number of pairs of candidates from the last round Seed=1 # Random number see. Skip = 0.33 Verbose=1 # Verbose = 0 means silence CONVFMT="%.8g"; # Increase the string size for array contents so we can see the Pinch IGNORECASE=1; SUBSEP = "="; _ = SUBSEP OFS=" _ "; C="," } ## -------------------------------------------------- #Data entry. Pretty routine stuff. /@attribute/ {Name[++Name[0]]=$2; Name[$2] = Name[0]} {gsub(/[ \t]*/,"")} # no blanks {gsub(/%.*/,"")} # no comments /^$/ {next} # no blank likes /@data/ {In=1;FS=","; srand(Seed)} /@/ {next} In { Bin = rand() > Skip Rows[Bin]++; train(Bin,Rows[Bin],Data,U) } END {learn(1,Rows,Data,U) } function train(bin,row,d,u, what,i) { if (! Max) $NF *= -1 for(i=1;i<=NF;i++) { if ($i == "?") # skip missing values continue; what = Name[i] # what is the name of column i? d[bin,row,what]=$i # raw data storage u[what,$i]=1 # remember all the unique values } } # Now we can begin. Try learning rules for each hypothesis. function learn(bin,rows,data,u, r,class) { ranges(u,r) round0(bin,counts,rows,data,which0) oo(which0,"which0learn","-n -k 5") rounds(1,(Max ? -1* Inf : Inf) ,Lives,which0,rows,data,which) } for(class in k) learn1(bin,class,all,h,rows,data,f,u,r) } # In round0, offer a rough ranking of # the ranges. In subequent rounds, randomly select and combine ranges # from prior randoms. function learn1(bin,class,all,h,rows,data,f,u,r, which0,which1) { print "\n" class Eval=Eval1; if (Verbose) print "" round0(bin,class,all,rows,f,u,h,which0); # make some initial guess rounds(bin,class,1,0,Lives,which0,rows,data,f,r,u,which1) #o(which1,"which1") Eval=Eval2; test(bin,class,which1,rows,data,f) } function ranges(u,r, i,tmp) { for(i in u) { split(i,tmp,_) r[tmp[1]]++ } } # In round one, score by b^2/(b+r); i.e. things more likely in # the target class than otherwise (with some support weighting) function round0(bin,class,all,rows,f,u,h,which, some,i,j,b,r,s,memo,score) { for(i in u) { some = f[bin _ class _ i] r = (all[bin _ i] - some)/(rows[bin] - h[bin _ class] + Pinch) b = some / (h[bin _ class] + Pinch) #print "i " i " some " some " r " r " b " b if (b > r) { j = class "," i s = b^2/(b+r) + rand()*Pinch memo[s] = j score[j]= s }} chop(score,memo,which) # prune the dull candidates } # Given some score[key]=number and memo[number]=key, # sort the scores and return the top Beam # number of keys, pruning all keys less than # Dull times the max score. function chop(score0,memo,out, score,n,i) { n=asort(score0,score) for(i=n;i>=1;i--) { if (score[i] <= score[n]*Dull) break; if (i <= n - Beam) break out[memo[score[i]]] = score[i] } } # In subsequent rounds one, score all the candidates # by running that combination over the data (see the "score" # function. Note the "score" paramter that caches prior # calcuations of the score. This speeds up the code by # a factor of four (for large data sets). function rounds(bin,class,round, max0,lives,which0,rows,data,f,r,u,out,score, \ ideas,max,i,sample,which1,s,memo,which2) { if (round == 1) max=0 else { # terminate if we have stopped improving max = most(which0) lives = (max > (max0*More)) ? Lives : lives - 1 if(lives < 0) { # if termination, copy input to out for(i in which0) out[i] = which0[i] return max } } if (Verbose) { print "% bin " bin " class: " class " seed: " Seed " verbose: " Verbose\ " round: " round " max: " max " lives: " lives } normalize(which0) # make all the counts n= 1..100 explode(which0,sample) # copy items n times twos(class,r,sample,Samples,which1) # pick items at random from that sample for(i in which0) # add in the last rounds' ideas which1[i] = i; beliefsInject(class,which1) if (Verbose > 1) values(which1,"candidate") for(i in which1) { # score the new picks and the last rounds's ideas s = (i in score) ? score[i] : score(bin,class,i,rows,data,f) + rand()*Pinch memo[s] = i score[i] = s } if (Verbose > 1) beliefsReview(class,score) chop(score,memo,which2) # prune the dull candidates if (Verbose > 1) o(which2,"scores","-n -k 1") return rounds(bin,class,round+1,max,lives,which2,rows,data,f,u,r,out,score) } function beliefsInject(class, which, ideas,i) { split(Beliefs,ideas,";") for(i in ideas) which[class "," ideas[i]] = class "," ideas[i] } function beliefsReview(class, score, ideas,i) { split(Beliefs,ideas,";") for(i in ideas) print "##score " ideas[i] " " score[class "," ideas[i]] } ## ----------------------------------------- # Randomly pick pairs and combine them. Note that, # in the following code, the picks come from "sample" # where an item may be repeated many times (so things # that are often repeated are more likely to be picked). # "n" times, pick two things from "sample" # and store them in "sampled". note htat # the combined rules all start with the target class function twos(class,r,sample,n,sampled, pair) { while(n--) { pair= two(class,r,sample) sampled[pair]=pair } } # Pick two things at random. Try not # to pick the same thing twice. Return # the combination of the two things you pick. function two(class,r,sample, tries, this, that,new) { this = one(sample) if(tries == 9) # nine lives return this that = one(sample) if (this == that) return two(class,r,sample,tries + 1) else { new = combine(class,r,this,that) return new ? new : two(class,r,sample,tries + 1) } } # combine two rules. don't repeat any ranges. # sort them so that all the ranges of the same # feature fall together. Note that the frst item in # a rule is the target class. prune those entries # away (so they don not repeat themselves). function combine(class,r,this,that, k,v,r0,n,i,used,tmp,tests,pair,ignore,out) { sub(/^[^,]*,/,"",this) # get rid of initial class symbol sub(/^[^,]*,/,"",that) # (its just repeated info anyway) split(this "," that,tmp,",") n=asort(tmp) # sort the combined attr=val pairs # (this makes subsequent equality tests easy- # just use "==") for(i=1;i<=n;i++) if (!used[tmp[i]]) { # never use the same pair twice used[tmp[i]] = 1 # remember that we've used this pair. split(tmp[i],pair,_) # the string "attr=val" is a pair k=pair[1] # with key "k" v=pair[2] # and value "v" push(tests,k) push(tests,v) r0[k]++ # remember how many times we've seen r0 if( r0[k] >= r[k] ) # if we've seen it too often ignore[k] = 1 # then mark it "ignore"d } for(i=1; i<=tests[0]; i += 2) # build the rule if (!ignore[tests[i]]) # skip the ones we are ignoring out = concat(out, tests[i] _ tests[i+1], C) return out ? class "," out : "" } ## --------------------------------- ## score a rule by finding its matching rows in data. # test a rule by scoring it function test(bin,class,rules,rows,data,f, com) { com="sort -t _ -n -k 2 #" rand() for(rule in rules) if (rule) print rule,rules[rule],score(0,class,rule,rows,data,f) | com close(com) } function score(bin,class,rule,rows,data,f, goal,row, col,a,b,c,d,triggered,pd,pf,prec,acc,support,s,fits) { s=a=b=c=d=Pinch # stop divide by zero errors goal=Name[Name[0]] for(row=1;row<=rows[bin];row++) { triggered = matched(row,bin,data,rule) if (data[bin,row,goal] == class) { if (triggered) {d++} else {b++} } else { if (triggered) {c++} else {a++} } } fits = c + d pd = d/(b+d) pf = a/(a+c) prec = d/(c+d) acc = (a+d)/(a+b+c+d) lift = (d/(c+d)) / ((b+d)/(a+b+c+d )) support = (c+d)/(a+b+c+d) s = score1(pd,pf,prec,acc,support,lift,fits) return s } function score1(pd,pf,prec,acc,support,lift,fits) { if (fits <= OverFitted) return 0 if (Eval==1) return acc if (Eval==2) return 2 * pd * prec/(pd+prec) if (Eval==3) return 2 * pd * pf/(pd+pf) if (Eval==4) return support * 2 * pd * pf/(pd+pf) if (Eval==5) return support * acc if (Eval==6) return support * 2 * pd * prec/(pd+prec) if (Eval==7) return lift if (Eval==8) return lift*support # which btw, is recall return support * 2 * pd * prec/(pd+prec) } # Given "this" of the form "f1_v1,f2_v2,...." see if "row" matches "this". # Assumes that disjunctions are modeled as configuous values from the # same feature (this is gaurenteed by "combine"). Hence, whenever # we move to a new feature, we need to check that at least of the values # mentioned with the old feature was found. function matched(row,bin,data,this, col,n,goals,pair,f0,f,status) { n=split(this,goals,",") for(col=2;col<=n;col++) { split(goals[col],pair,_) f = pair[1] status[f] += data[bin,row,f] == pair[2] if (f0 && (f != f0) && !status[f0]) return 0 f0 = f } return status[f] } ## -------------------------------------- ## interesting utils # Given an array a[i]=n, # fill up "out" with "n" number # of "i". This creates a "sample" of # things, from which we can pick randomly # biased by the relative frequencies of "n". # The total size of the sample is stored # in "sample[0]" function explode(a, out,i,j) { for(i in a) for(j=1;j<=a[i];j++) out[++out[0]] = i } # Pick any item at random from "sample". # Assumes that the same size is in array # element "sample[0]" function one(sample, any) { any = int(rand()* sample[0])+1 return sample[any] } ## -------------------------------------- ## boring utils # Given an array a[i]=num, normalize # all the numbers to integers 0..100 function normalize(a, i,sum) { for(i in a) sum += a[i] for(i in a) a[i] = int(100*a[i]/sum) } # combine an feature/ range function fv(f,v) { return f _ v } # find the max item in an array function most(a, i,max) { max = -1000000000 for(i in a) if (a[i] > max) max = a[i]; return max } # push function push(a,x) { return a[++a[0]]=x } # string append, with seperator function concat(s1,s2,sep) { return s1 ? s1 sep s2 : s2 } # print array values function values(a,s,what, i,com) { print "" com = what ? "sort " what : "sort" for(i in a) print "% " s": " a[i] | com; close(com) } # print an array, sorted by "what" function o(a,s,what, i,com) { print "" com = what ? "sort -t, " what : "sort -t, " for(i in a) print s"["a[i] ","i"]." | com; close(com) } function oo(a,s, i){ print "" for(i in a) print s"[" i "] = " a[i] }