#!/sw/bin/gawk -f ########################################################################## # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . ########################################################################## # which2 : a stochastic anytime rule learner # (c) Tim Menzies (tim@menzies.us) 2010, LGLP 3.0 # This program builds rules by ranking ideas, then repeatedly building new ideas # by picking # and combining two old ideas (favoring those with higher ranks). New # ideas generated in this way are ranked and thrown back into the same pot as # the old ideas so, if they are any good, they might be picked and extended # in subsequent rounds. Alternatively, if the new idea stinks, it gets buried # by the better ideas and is ignore. # One important aspect of the following is that the scoring routines for # ideas are completely seperate from the rest of the code (see the "score1" # function). Hence, it is a simple # matter to try our different search biases. # e.g. This call produces the following output. # gawk -f which2n.awk servo.arff # In the following, the "candidates" are ideas that look promsing # and "score" ranks the candidates. If the max score does not improve # from the last round, then "lives" decreases. # Each round tries random combinations of the stuff from prior rounds # (favoring those things with higher scores). Hence, initially, # all the candidates are singletons. But. later on # the candidates can grow to combinations of things. # Each round prunes the candiates so that only the better candiadtes # surive to round+1. # The final output are the candidates of the last round. # ---------------------------------------------------- # % max: 0 seed: 1 round: 1 max: 0 lives: 5 # # % candidate: 1,motor=A # % candidate: 1,motor=A,motor=C # % candidate: 1,motor=A,pgain=3 # % candidate: 1,motor=A,vgain=1 # % candidate: 1,motor=A,vgain=2 # % candidate: 1,motor=B # % candidate: 1,motor=B,screw=B # % candidate: 1,motor=B,screw=C # % candidate: 1,motor=B,vgain=2 # % candidate: 1,motor=C # % candidate: 1,pgain=3 # % candidate: 1,pgain=3,screw=A # % candidate: 1,pgain=3,screw=B # % candidate: 1,pgain=3,vgain=1 # % candidate: 1,pgain=3,vgain=2 # % candidate: 1,pgain=4 # % candidate: 1,pgain=4,vgain=2 # % candidate: 1,screw=A # % candidate: 1,screw=A,screw=B # % candidate: 1,screw=B # % candidate: 1,screw=B,vgain=1 # % candidate: 1,screw=B,vgain=2 # % candidate: 1,screw=C # % candidate: 1,screw=C,vgain=1 # % candidate: 1,vgain=1 # % candidate: 1,vgain=2 # # 60.551088,1,motor=B # 63.401463,1,motor=A # 73.101076,1,pgain=3,vgain=1 # 74.276395,1,screw=A # 84.839046,1,vgain=1 # 87.901248,1,pgain=3,vgain=2 # 100.38861,1,vgain=2 # 113.56437,1,motor=A,motor=C # 122.99598,1,screw=A,screw=B # 161.00124,1,pgain=3 # # ---------------------------------------------------- # % max: 161.00124 seed: 1 round: 2 max: 161.00124 lives: 5 # # % candidate: 1,motor=A # % candidate: 1,motor=A,motor=C # % candidate: 1,motor=A,motor=C,pgain=3 # % candidate: 1,motor=A,motor=C,screw=A # % candidate: 1,motor=A,screw=A # % candidate: 1,motor=A,screw=A,screw=B # % candidate: 1,motor=B # % candidate: 1,motor=B,pgain=3 # % candidate: 1,motor=B,screw=A,screw=B # % candidate: 1,motor=B,vgain=1 # % candidate: 1,pgain=3 # % candidate: 1,pgain=3,screw=A # % candidate: 1,pgain=3,screw=A,screw=B # % candidate: 1,pgain=3,vgain=1 # % candidate: 1,pgain=3,vgain=1,vgain=2 # % candidate: 1,pgain=3,vgain=2 # % candidate: 1,screw=A # % candidate: 1,screw=A,screw=B # % candidate: 1,screw=A,screw=B,vgain=1 # % candidate: 1,screw=A,screw=B,vgain=2 # % candidate: 1,screw=A,vgain=2 # % candidate: 1,vgain=1 # % candidate: 1,vgain=2 # # 74.276395,1,screw=A # 78.801562,1,motor=A,motor=C,pgain=3 # 83.001614,1,pgain=3,screw=A,screw=B # 84.839046,1,vgain=1 # 87.901248,1,pgain=3,vgain=2 # 100.38861,1,vgain=2 # 113.56437,1,motor=A,motor=C # 122.99598,1,screw=A,screw=B # 161.00117,1,pgain=3,vgain=1,vgain=2 # 161.00124,1,pgain=3 # # ---------------------------------------------------- # % max: 161.00124 seed: 1 round: 3 max: 161.00124 lives: 4 # # % candidate: 1,motor=A,motor=C # % candidate: 1,motor=A,motor=C,pgain=3 # % candidate: 1,motor=A,motor=C,pgain=3,screw=A,screw=B # % candidate: 1,motor=A,motor=C,screw=A # % candidate: 1,pgain=3 # % candidate: 1,pgain=3,screw=A # % candidate: 1,pgain=3,screw=A,screw=B # % candidate: 1,pgain=3,screw=A,screw=B,vgain=1 # % candidate: 1,pgain=3,screw=A,screw=B,vgain=1,vgain=2 # % candidate: 1,pgain=3,screw=A,screw=B,vgain=2 # % candidate: 1,pgain=3,screw=A,vgain=1,vgain=2 # % candidate: 1,pgain=3,screw=A,vgain=2 # % candidate: 1,pgain=3,vgain=1,vgain=2 # % candidate: 1,pgain=3,vgain=2 # % candidate: 1,screw=A # % candidate: 1,screw=A,screw=B # % candidate: 1,vgain=1 # % candidate: 1,vgain=1,vgain=2 # % candidate: 1,vgain=2 # # 83.001614,1,pgain=3,screw=A,screw=B # 83.001702,1,pgain=3,screw=A,screw=B,vgain=1,vgain=2 # 84.839046,1,vgain=1 # 87.901248,1,pgain=3,vgain=2 # 100.38861,1,vgain=2 # 113.56437,1,motor=A,motor=C # 122.99598,1,screw=A,screw=B # 161.00117,1,pgain=3,vgain=1,vgain=2 # 161.00124,1,pgain=3 # 185.22661,1,vgain=1,vgain=2 # # ---------------------------------------------------- # % max: 185.22661 seed: 1 round: 4 max: 185.22661 lives: 5 # # % candidate: 1,motor=A,motor=C # % candidate: 1,motor=A,motor=C,pgain=3 # % candidate: 1,motor=A,motor=C,pgain=3,screw=A,screw=B # % candidate: 1,motor=A,motor=C,pgain=3,screw=A,screw=B,vgain=1,vgain=2 # % candidate: 1,motor=A,motor=C,vgain=1 # % candidate: 1,pgain=3 # % candidate: 1,pgain=3,screw=A,screw=B # % candidate: 1,pgain=3,screw=A,screw=B,vgain=1,vgain=2 # % candidate: 1,pgain=3,vgain=1 # % candidate: 1,pgain=3,vgain=1,vgain=2 # % candidate: 1,pgain=3,vgain=2 # % candidate: 1,screw=A,screw=B # % candidate: 1,screw=A,screw=B,vgain=2 # % candidate: 1,vgain=1 # % candidate: 1,vgain=1,vgain=2 # % candidate: 1,vgain=2 # # 83.001614,1,pgain=3,screw=A,screw=B # 83.001702,1,pgain=3,screw=A,screw=B,vgain=1,vgain=2 # 84.839046,1,vgain=1 # 87.901248,1,pgain=3,vgain=2 # 100.38861,1,vgain=2 # 113.56437,1,motor=A,motor=C # 122.99598,1,screw=A,screw=B # 161.00117,1,pgain=3,vgain=1,vgain=2 # 161.00124,1,pgain=3 # 185.22661,1,vgain=1,vgain=2 # # ---------------------------------------------------- # % max: 185.22661 seed: 1 round: 5 max: 185.22661 lives: 4 # # % candidate: 1,motor=A,motor=C # % candidate: 1,motor=A,motor=C,pgain=3,screw=A,screw=B # % candidate: 1,motor=A,motor=C,pgain=3,vgain=1,vgain=2 # % candidate: 1,motor=A,motor=C,vgain=1 # % candidate: 1,motor=A,motor=C,vgain=1,vgain=2 # % candidate: 1,pgain=3 # % candidate: 1,pgain=3,screw=A,screw=B # % candidate: 1,pgain=3,screw=A,screw=B,vgain=1,vgain=2 # % candidate: 1,pgain=3,screw=A,screw=B,vgain=2 # % candidate: 1,pgain=3,vgain=1 # % candidate: 1,pgain=3,vgain=1,vgain=2 # % candidate: 1,pgain=3,vgain=2 # % candidate: 1,screw=A,screw=B # % candidate: 1,screw=A,screw=B,vgain=1,vgain=2 # % candidate: 1,vgain=1 # % candidate: 1,vgain=1,vgain=2 # % candidate: 1,vgain=2 # # 84.839046,1,vgain=1 # 87.901248,1,pgain=3,vgain=2 # 90.726125,1,motor=A,motor=C,vgain=1,vgain=2 # 99.7264,1,screw=A,screw=B,vgain=1,vgain=2 # 100.38861,1,vgain=2 # 113.56437,1,motor=A,motor=C # 122.99598,1,screw=A,screw=B # 161.00117,1,pgain=3,vgain=1,vgain=2 # 161.00124,1,pgain=3 # 185.22661,1,vgain=1,vgain=2 # # ---------------------------------------------------- # % max: 185.22661 seed: 1 round: 6 max: 185.22661 lives: 3 # # % candidate: 1,motor=A,motor=C # % candidate: 1,motor=A,motor=C,pgain=3,vgain=1,vgain=2 # % candidate: 1,motor=A,motor=C,screw=A,screw=B,vgain=1,vgain=2 # % candidate: 1,motor=A,motor=C,vgain=1 # % candidate: 1,motor=A,motor=C,vgain=1,vgain=2 # % candidate: 1,pgain=3 # % candidate: 1,pgain=3,screw=A,screw=B,vgain=1,vgain=2 # % candidate: 1,pgain=3,vgain=1,vgain=2 # % candidate: 1,pgain=3,vgain=2 # % candidate: 1,screw=A,screw=B # % candidate: 1,screw=A,screw=B,vgain=1 # % candidate: 1,screw=A,screw=B,vgain=1,vgain=2 # % candidate: 1,vgain=1 # % candidate: 1,vgain=1,vgain=2 # % candidate: 1,vgain=2 # # 84.839046,1,vgain=1 # 87.901248,1,pgain=3,vgain=2 # 90.726125,1,motor=A,motor=C,vgain=1,vgain=2 # 99.7264,1,screw=A,screw=B,vgain=1,vgain=2 # 100.38861,1,vgain=2 # 113.56437,1,motor=A,motor=C # 122.99598,1,screw=A,screw=B # 161.00117,1,pgain=3,vgain=1,vgain=2 # 161.00124,1,pgain=3 # 185.22661,1,vgain=1,vgain=2 # # ---------------------------------------------------- # % max: 185.22661 seed: 1 round: 7 max: 185.22661 lives: 2 # # % candidate: 1,motor=A,motor=C # % candidate: 1,motor=A,motor=C,pgain=3,vgain=1,vgain=2 # % candidate: 1,motor=A,motor=C,screw=A,screw=B,vgain=1,vgain=2 # % candidate: 1,motor=A,motor=C,vgain=1,vgain=2 # % candidate: 1,pgain=3 # % candidate: 1,pgain=3,screw=A,screw=B,vgain=1,vgain=2 # % candidate: 1,pgain=3,screw=A,screw=B,vgain=2 # % candidate: 1,pgain=3,vgain=1 # % candidate: 1,pgain=3,vgain=1,vgain=2 # % candidate: 1,pgain=3,vgain=2 # % candidate: 1,screw=A,screw=B # % candidate: 1,screw=A,screw=B,vgain=1 # % candidate: 1,screw=A,screw=B,vgain=1,vgain=2 # % candidate: 1,screw=A,screw=B,vgain=2 # % candidate: 1,vgain=1 # % candidate: 1,vgain=1,vgain=2 # % candidate: 1,vgain=2 # # 84.839046,1,vgain=1 # 87.901248,1,pgain=3,vgain=2 # 90.726125,1,motor=A,motor=C,vgain=1,vgain=2 # 99.7264,1,screw=A,screw=B,vgain=1,vgain=2 # 100.38861,1,vgain=2 # 113.56437,1,motor=A,motor=C # 122.99598,1,screw=A,screw=B # 161.00117,1,pgain=3,vgain=1,vgain=2 # 161.00124,1,pgain=3 # 185.22661,1,vgain=1,vgain=2 # # ---------------------------------------------------- # % max: 185.22661 seed: 1 round: 8 max: 185.22661 lives: 1 # # % candidate: 1,motor=A,motor=C # % candidate: 1,motor=A,motor=C,pgain=3 # % candidate: 1,motor=A,motor=C,vgain=1,vgain=2 # % candidate: 1,pgain=3 # % candidate: 1,pgain=3,screw=A,screw=B # % candidate: 1,pgain=3,screw=A,screw=B,vgain=1,vgain=2 # % candidate: 1,pgain=3,vgain=1,vgain=2 # % candidate: 1,pgain=3,vgain=2 # % candidate: 1,screw=A,screw=B # % candidate: 1,screw=A,screw=B,vgain=1 # % candidate: 1,screw=A,screw=B,vgain=1,vgain=2 # % candidate: 1,screw=A,screw=B,vgain=2 # % candidate: 1,vgain=1 # % candidate: 1,vgain=1,vgain=2 # % candidate: 1,vgain=2 # # 84.839046,1,vgain=1 # 87.901248,1,pgain=3,vgain=2 # 90.726125,1,motor=A,motor=C,vgain=1,vgain=2 # 99.7264,1,screw=A,screw=B,vgain=1,vgain=2 # 100.38861,1,vgain=2 # 113.56437,1,motor=A,motor=C # 122.99598,1,screw=A,screw=B # 161.00117,1,pgain=3,vgain=1,vgain=2 # 161.00124,1,pgain=3 # 185.22661,1,vgain=1,vgain=2 # # ---------------------------------------------------- # % max: 185.22661 seed: 1 round: 9 max: 185.22661 lives: 0 # # % candidate: 1,motor=A,motor=C # % candidate: 1,motor=A,motor=C,pgain=3 # % candidate: 1,motor=A,motor=C,pgain=3,vgain=1,vgain=2 # % candidate: 1,motor=A,motor=C,pgain=3,vgain=2 # % candidate: 1,motor=A,motor=C,screw=A,screw=B,vgain=1,vgain=2 # % candidate: 1,motor=A,motor=C,vgain=1 # % candidate: 1,motor=A,motor=C,vgain=1,vgain=2 # % candidate: 1,motor=A,motor=C,vgain=2 # % candidate: 1,pgain=3 # % candidate: 1,pgain=3,screw=A,screw=B,vgain=1,vgain=2 # % candidate: 1,pgain=3,vgain=1,vgain=2 # % candidate: 1,pgain=3,vgain=2 # % candidate: 1,screw=A,screw=B # % candidate: 1,screw=A,screw=B,vgain=1 # % candidate: 1,screw=A,screw=B,vgain=1,vgain=2 # % candidate: 1,vgain=1 # % candidate: 1,vgain=1,vgain=2 # % candidate: 1,vgain=2 # # 84.839046,1,vgain=1 # 87.901248,1,pgain=3,vgain=2 # 90.726125,1,motor=A,motor=C,vgain=1,vgain=2 # 99.7264,1,screw=A,screw=B,vgain=1,vgain=2 # 100.38861,1,vgain=2 # 113.56437,1,motor=A,motor=C # 122.99598,1,screw=A,screw=B # 161.00117,1,pgain=3,vgain=1,vgain=2 # 161.00124,1,pgain=3 # 185.22661,1,vgain=1,vgain=2 BEGIN { Max = 1 # If (1 or 0), try to (max or min) the score Seed=1 # Random number see. More = 1.02; # Improvement means at least a 2% growth Lives=5; # If no improvement after five rounds, give up Dull=0.1; # Ignore candidates with score < Dull*MaxScore Beam=10; # Only the top (say) 10 candidates survive to the next round Samples=20; # Pick this number of pairs of candidates from the last round Pinch = 1/1000; # Add a random number of up to "Pinch" to each score OverFitted=3; # When do we prune a rule that matches on too few instances? CONVFMT="%.8g"; # Increase the string size for array contents so we can see the Pinch IGNORECASE=1; SUBSEP = "="; _ = SUBSEP OFS=","; C="," Verbose=1 # Verbose = 0 means silence } ## -------------------------------------------------- #Data entry. Pretty routine stuff. /@attribute/ {Name[++Name[0]]=$2; Name[$2] = Name[0]} {gsub(/[ \t]*/,"")} # no blanks {gsub(/%.*/,"")} # no comments /^$/ {next} # no blank likes /@data/ {In=1;FS=","; srand(Seed)} /@/ {next} In {Rows++; train(Rows,Data,Counts)} END {learn(Rows,Data,Counts)} function train(rows,d,c, what,i) { for(i=1;i<=NF;i++) { if ($i == "?") continue; what = Name[i] d[rows,what]=$i c[what,$i] += $NF } } # In round0, offer a rough ranking of # the ranges. In subequent rounds, randomly select and combine ranges # from prior randoms. function learn(rows,data,counts, which0,which) { #oo(Data,"data"); round0(counts,which0) rounds(1,0,Lives,which0,rows,data,which) } # In round one, score using the class var; i.e. things more likely in # the target class than otherwise (with some support weighting) function round0(counts,which, j,i,s,memo,score) { for(i in counts) { s = Max ? counts[i] : -1*counts[i] j = Max "," i memo[s] = j score[j] = s } chop(score,memo,which) } # Given some score[key]=number and memo[number]=key, # sort the scores and return the top Beam # number of keys, pruning all keys less than # Dull times the max score. function chop(score0,memo,out, score,n,i) { n=asort(score0,score) for(i=n;i>=1;i--) { if (score[i] <= score[n]*Dull) break; if (i <= n - Beam) break out[memo[score[i]]] = score[i] } } # In subsequent rounds one, score all the candidates # by running that combination over the data (see the "score" # function. Note the "score" paramter that caches prior # calcuations of the score. This speeds up the code by # a factor of four (for large data sets). function rounds(round, max0,min,lives,which0,rows,data,out,score, \ max,i,sample,which1,s,memo,which2) { if (round == 1) max=0 else { # terminate if we have stopped improving max = Max ? most(which0) : least(which0) if (Max) { lives = (max > (max0*More)) ? Lives : lives - 1 } else { lives = (max < (max0/More)) ? Lives : lives - 1 } if(lives < 0) { # if termination, copy input to out for(i in which0) out[i] = which0[i] return max } } print "\n----------------------------------------------------" print "% max: " max " seed: " Seed \ " round: " round " best: " max " lives: " lives normalize(which0) # make all the counts n= 1..100 explode(which0,sample) # copy items n times twos(Max,sample,Samples,which1) # pick items at random from that sample for(i in which0) # add in the last rounds' ideas which1[i] = i; if (Verbose) values(which1,"candidate") for(i in which1) { # score the new picks and the last rounds's ideas s = (i in score) ? score[i] : score(i,rows,data) + rand()*Pinch memo[s] = i score[i] = s } chop(score,memo,which2) # prune the dull candidates if (Verbose) o(which2,"score","-t, -n -k 1") return rounds(round+1,max,lives,which2,rows,data,out,score) } ## ----------------------------------------- # Randomly pick pairs and combine them. Note that, # in the following code, the picks come from "sample" # where an item may be repeated many times (so things # that are often repeated are more likely to be picked). # "n" times, pick two things from "sample" # and store them in "sampled". note htat # the combined rules all start with the target class function twos(class,sample,n,sampled, pair) { while(n--) { pair= two(class,sample) sampled[pair]=pair } } # Pick two things at random. Try not # to pick the same thing twice. Return # the combination of the two things you pick. function two(class,sample, tries, this, that) { this = one(sample) if(tries == 9) # nine lives return this that = one(sample) if (this == that) return two(class,sample,tries + 1) else return combine(class,this,that) } # combine two rules. don't repeat any ranges. # sort them so that all the ranges of the same # feature fall together. Note that the frst item in # a rule is the target class. prune those entries # away (so they don not repeat themselves). function combine(class,this,that, n,i,used,tmp,out) { sub(/^[^,]*,/,"",this) sub(/^[^,]*,/,"",that) split(this "," that,tmp,",") n=asort(tmp) out=tmp[1] used[tmp[1]]=1 for(i=1;i<=n;i++) if (!used[tmp[i]]) { out = out "," tmp[i] used[tmp[i]] = 1 } return class "," out } ## --------------------------------- ## score a rule by finding its matching rows in data. function score(rule,rows,data, cols,row,s,triggered,fits,sum,support) { fits=sum=Pinch # stop divide by zero errors cols=Name[Name[0]] for(row=1;row<=rows;row++) { if(matched(row,data,rule)) { fits++ sum += data[row,cols] }} support = fits/rows s = score1(fits,support,sum) return Max ? s : -1*s } function score1(fits,support,sum) { if (fits <= OverFitted) return 0 if (Eval==1) return support * sum return sum } # Given "this" of the form "f1_v1,f2_v2,...." see if "row" matches "this". # Assumes that disjunctions are modeled as configuous values from the # same feature (this is gaurenteed by "combine"). Hence, whenever # we move to a new feature, we need to check that at least of the values # mentioned with the old feature was found. function matched(row,data,this, out,col,n,goals,pair,f0,f,status) { n=split(this,goals,",") for(col=2;col<=n;col++) { split(goals[col],pair,_) f = pair[1] status[f] += data[row,f] == pair[2] if (f0 && (f != f0) && !status[f0]) return 0 f0 = f } return status[f] } ## -------------------------------------- ## interesting utils # Given an array a[i]=n, # fill up "out" with "n" number # of "i". This creates a "sample" of # things, from which we can pick randomly # biased by the relative frequencies of "n". # The total size of the sample is stored # in "sample[0]" function explode(a, out,i,j) { for(i in a) for(j=1;j<=a[i];j++) out[++out[0]] = i } # Pick any item at random from "sample". # Assumes that the same size is in array # element "sample[0]" function one(sample, any) { any = int(rand()* sample[0])+1 return sample[any] } ## -------------------------------------- ## boring utils # Given an array a[i]=num, normalize # all the numbers to integers 0..100 function normalize(a, i,sum) { for(i in a) sum += a[i] for(i in a) a[i] = int(100*a[i]/sum) } # combine an feature/ range function fv(f,v) { return f _ v } # find the max item in an array function most(a, i,max) { max = -1000000000 for(i in a) if (a[i] > max) max = a[i]; return max } # find the min item in an array function least(a, i,min) { min = 1000000000 for(i in a) if (a[i] < min) min = a[i]; return min } # print array values function values(a,s,what, i,com) { print "" com = what ? "sort " what : "sort" for(i in a) print "% " s": " a[i] | com; close(com) } # print an array, sorted by "what" function o(a,s,what, i,com) { print "" com = what ? "sort " what : "sort " for(i in a) print a[i] ","i | com; close(com) } # print an array, sorted by "what" function oo(a,s,what, i,com) { print "" com = what ? "sort " what : "sort " for(i in a) print s "[ "i" ] = " a[i] | com; close(com) }