#!/sw/bin/gawk -f ########################################################################## # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . ########################################################################## # whichn2 : a stochastic anytime rule-based optimizer # (c) Tim Menzies (tim@menzies.us) 2010, LGLP 3.0 # This program builds rules by ranking ideas, then repeatedly building new ideas # by picking # and combining two old ideas (favoring those with higher ranks). New # ideas generated in this way are ranked and thrown back into the same pot as # the old ideas so, if they are any good, they might be picked and extended # in subsequent rounds. Alternatively, if the new idea stinks, it gets buried # by the better ideas and is ignore. # One important aspect of the following is that the scoring routines for # ideas are completely seperate from the rest of the code (see the "score1" # function). Hence, it is a simple # matter to try our different search biases. # e.g. This call produces the following output. # gawk -f which2n.awk titanic.arff # In the following, the "candidates" are ideas that look promsing # and "score" ranks the candidates. If the max score does not improve # from the last round, then "lives" decreases. # Each round tries random combinations of the stuff from prior rounds # (favoring those things with higher scores). Hence, at round 1, # all the candidates are singletons. But. later on (see line 54) # the candidates can grow to combinations of things. # Each round prunes the candiates so that only the better candiadtes # surive to round+1. BEGIN { Seed=1 # Random number see. More = 1.02; # Improvement means at least a 2% growth Lives=5; # If no improvement after five rounds, give up Dull=0.1; # Ignore candidates with score < Dull*MaxScore Beam=10; # Only the top (say) 10 candidates survive to the next round Samples=20; # Pick this number of pairs of candidates from the last round Pinch = 1/1000; # Add a random number of up to "Pinch" to each score OverFitted=3; # When do we prune a rule that matches on too few instances? CONVFMT="%.8g"; # Increase the string size for array contents so we can see the Pinch IGNORECASE=1; _ = SUBSEP; C="," } ## -------------------------------------------------- #Data entry. Pretty routine stuff. /@attribute/ {Name[++Name[0]]=$2} {gsub(/[ \t]*/,"")} # no blanks {gsub(/#.*/,"")} # no comments /^$/ {next} # no blank likes /@data/ {In=1; FS=","; srand(Seed)} /@/ {next} In {Rows++; train(Rows,Data,Counts)} function train(rows,d,c, i) { for(i=1;i<=NF;i++) { if ($i == "?") continue; d[rows,i]=$i c[i,$i] += $NF } } # Now we can begin. In round0, offer a rough ranking of # the ranges. In subequent rounds, randomly select and combine ranges # from prior randoms. END { round0(Counts,Hot0); # make some initial guess rounds(1,0,Lives,Hot0,Rows,Data,Out) } # In round one, score by b^/(b+r) function round0(c,hot, i,s,memo,score) { for(i in c) { s = c[i] memo[s] = i score[i]= s } chop(score,memo,hot) # prune the dull candidates } # Given some score[key]=number and memo[number]=key, # sort the scores and return the top Beam # number of keys, pruning all keys less than # Dull times the max score. function chop(score0,memo,out, score,n,i) { n=asort(score0,score) for(i=n;i>=1;i--) { if (score[i] <= score[n]*Dull) break; if (i <= n - Beam) break out[memo[score[i]]] = score[i] } } # In subsequent rounds one, score all the candidates # by running that combination over the data (see the "score" # function. Note the "score" paramter that caches prior # calcuations of the score. This speeds up the code by # a factor of four (for large data sets). function rounds(round, max0,lives,hot0,rows,data,out,score, \ max,i,sample,hot1,s,memo,hot2) { if (round == 1) max=0 else { # terminate if we have stopped improving max = most(hot0) lives = (max > (max0*More)) ? Lives : lives - 1 if(lives < 0) { # if termination, copy input to out for(i in hot0) out[i] = hot0[i] return max } } print "\n---| round: " round " seed: " Seed " |-----------------------------------" print "max : " max "\nlives : " lives normalize(hot0) # make all the counts n= 1..100 explode(hot0,sample) # copy items n times twos(sample,Samples,hot1) # pick items at random from that sample for(i in hot0) # add in the last rounds' ideas hot1[i] = i; values(hot1,"candidate") for(i in hot1) { # score the new picks and the last rounds's ideas s = (i in score) ? score[i] : score(i,rows,data) + rand()*Pinch memo[s] = i score[i] = s } chop(score,memo,hot2) # prune the dull candidates o(hot2,"score","-n -k 5") return rounds(round+1,max,lives,hot2,rows,data,out,score) } ## ----------------------------------------- # Randomly pick pairs and combine them. Note that, # in the following code, the picks come from "sample" # where an item may be repeated many times (so things # that are often repeated are more likely to be picked). # "n" times, pick two things from "sample" # and store them in "sampled" function twos(sample,n,sampled, pair) { while(n--) { pair= two(sample) sampled[pair]=pair } } # Pick two things at random. Try not # to pick the same thing twice. Return # the combination of the two things you pick. function two(sample, tries, this, that) { this = one(sample) if(tries == 9) # nine lives return this that = one(sample) if (this == that) return two(sample,tries + 1) else return combine(this,that) } # combine two rules. don't repeat any ranges. # sort them so that all the ranges of the same # feature fall together. function combine(this,that, used,tmp,out) { split(this "," that,tmp,",") n=asort(tmp) out = tmp[1] used[tmp[1]]=1 for(i=2;i<=n;i++) if (!used[tmp[i]]) { out = out "," tmp[i] used[tmp[i]] = 1 } return out } ## --------------------------------- ## score a rule by finding its matching rows in data. function score(i,rows,data, cols,row,triggered,fits,sum,support) { fits=sum=Pinch cols=Name[0] for(row=1;row<=rows;row++) { triggered = matched(row,cols,data,i) if (triggered) { fits++ sum += data[row,cols] } } support = fits/rows return score1(fits,support,sum) } function score1(fits,support,s) { if (fits <= OverFitted) return 0 if (Eval==1) return support * s return s } # Given "this" of the form "f1_v1,f2_v2,...." see if "row" matches "this". # Assumes that disjunctions are modeled as configuous values from the # same feature (this is gaurenteed by "combine"). Hence, whenever # we move to a new feature, we need to check that at least of the values # mentioned with the old feature was found. function matched(row,cols,data,this, col,n,goals,pair,f0,f,status) { n=split(this,goals,",") for(col=1;col<=n;col++) { split(goals[col],pair,_) f = pair[1] status[f] += data[row,f] == pair[2] if (f0 && (f != f0) && !status[f0]) return 0 f0 = f } return status[f] } ## -------------------------------------- ## interesting utils # Given an array a[i]=n, # fill up "out" with "n" number # of "i". This creates a "sample" of # things, from which we can pick randomly # biased by the relative frequencies of "n". # The total size of the sample is stored # in "sample[0]" function explode(a, out,i,j) { for(i in a) for(j=1;j<=a[i];j++) out[++out[0]] = i } # Pick any item at random from "sample". # Assumes that the same size is in array # element "sample[0]" function one(sample, any) { any = int(rand()* sample[0])+1 return sample[any] } ## -------------------------------------- ## boring utils # Given an array a[i]=num, normalize # all the numbers to integers 0..100 function normalize(a, i,sum) { for(i in a) sum += a[i] for(i in a) a[i] = int(100*a[i]/sum) } # combine an feature/ range function fv(f,v) { return f _ v } # find the max item in an array function most(a, i,max) { max = -1000000000 for(i in a) if (a[i] > max) max = a[i]; return max } # print array values function values(a,s,what, i,j,com) { print "" what = what ? what : "" com = "sort " what for(i in a) { j=a[i]; gsub(_,",",j); print s": " j | com; } close(com) } # print an array, sorted by "what" function o(a,s,what, i,j,com) { print "" what = what ? what : "" com = "sort " what for(i in a) { j=i; gsub(_,",",j); print s"[ "j" ] = " a[i] | com; } close(com) }