#!/usr/bin/gawk -f #

PRISM2

#

A "seperate and conqueor" rule generation algorithm. #

This file should be compiled using "am" (awk-with-macros). #

#Download from: ../src/prism
#Browse source: ../src/prism;
#See examples: ../eg/prism;
#View executable: ../bin/prism; (warning- verbose)
#

Preamble

#

Includes

#{ # replace these with the relevant functions #USES(globals) #USES(list/array) #USES(list/copy) #USES(string/saya) #}

Main

#{ BEGIN { FS=","; globals(); array(L) ; # table of local variables for this working memory array(Symbol); # maps symbols to an integer array(Lobmys); # maps integer to symbols array(Xs); # index of rows storing values array(E) ; # list of seen attribute value pairs array(Names) ; # list of column names array(Colvals); # frequency of column/value pairs array(Colval); # unique column values Wid = 1; # working memory id } { sub( /\%.*/,"") }; /^[ \t]*$/ { next } /@relation/ { FS=" "}; /@attribute/ { push2(Wid,$2,Names); }; /@data/ { FS="," } /@/ { next } Csv && ++Line==1 { for(I=1;I<=NF;I++) { push2(Wid,$I,Names) } next; } { Records++; print ">=== " Records; for(I=1;I<=NF;I++) update(Wid,NF,Records,I,$I); } END { rules(Wid) } function rules(w0, klasses,class,rule,covered,w) { classes(w0,klasses); class = popNextClass(w,klasses); print "top " class; if (! class) return; rules1(w0, class,rule,covered); print "IF " show(w0,rule) " THEN " class; w=cull(w0,covered); rules(w); } function rules1(w,class,rule,covered, col,as,score,perfect) { attrs(w,as); while( col = popNextAttr(as) ) { perfect = addBestValue(w,class,rule,covered,col); if (perfect) return; } } function addBestValue(w,class,rule,covered,col, pt,vs,maxs,maxpval,tmps,covered0,best) { vals(w,col,vs); maxs = -1; maxp = -1 while (val = popNextVal(col,vs)) { rule[col] = val; tmps = score(w, class,rule,covered0,pt); if (tmps > maxs) {copy(covered0,covered); maxs=tmps; best=val; } if (tmps==maxs && ps["p"]>maxp ) {copy(covered0,covered); maxs=tmps; best=val; maxp=ps["p"]; } }; rule[col]=best; return tmps == 1; } #}

Score one rule

#{ function score(w,target,rule,covered,pt, \ copyOfRule,ks,col,p,t,vl,steps,row,row1,max,val,i,m) { array(covered); copy(rule,copyOfRule); m=L[w,M]; #saya("rule",rule); while ( col = popHardestRuleCondition(w, copyOfRule) ) { p = t = 0; val = rule[col]; if (steps++) { for( row in covered ) if ( E[w,row,col] != val ) { delete covered[row]; } else { t++; p += E[w,row,m]==target } } else { max=Xs[w,val,0]; for(i=1;i<=max;i++) { row1=Xs[w,val,i]; covered[row1]=1; t++; p += E[w,row1,m] == target }} } pt["p"]=p; pt["t"]=t; return t ? p/t : 0; } #} #}

Working Memory Management

#{ function update(w, cols,row,col,val) { #val = sym(XI,val); # comment out when degugging if (col > L[w,M]) L[w,M]= col; if (cols > L[w,M]) L[w,M]= cols; if (row > L[w,N]) L[w,N]= row; E[w,row,col] = val; Xs[w,val,++Xs[w,val,0]] = row; if (++Colvals[w,col,val] == 1) Colval[w,col,++Colval[w,col,0]]=val } function sym(w,val) { if ( ! ((w,val) in Symbol) ) { Lobmys[w,++Lobmys[w,0]] = val; # define new symbol number Symbol[w,val] = Lobmys[w,0]; # map symbol to new number }; return Symbol[w,val]; } function cull(w0,covered,keep, rows,cols,col,i,n,w,max) { w= ++Wid; max=Names[w,0]=Names[w0,0]; for(i=1;i<=max;i++) Names[w,i]=Names[w0,i]; rows = L[w0,N]; cols = L[w0,M]; for(i=1;i<=rows;i++) if (! covered[i]) { n++; for(col=1;col<=cols;col++) update(w,cols,n,col,E[w0,i,col])} if (! keep) zaps(w0); # maybe, free some memory return w } #

Search control

#

Find all

#{ function vals(w,col,out, i,j,k) { j=Colval[w,col,0]; for(i=1;i<=j;i++) { k=Colval[w,col,i]; push(k,out)}; } function classes(w,out, m) { vals(w,L[w,M],out) } function array(a) { split("", a, "") } function attrs(w,out, i,m) { m=L[w,M]; for(i=1;iFind next best #{ function popNextVal(a,vs) { return pop(vs) } function popNextAttr(as) { return pop(as) } function popNextClass(w,klasses, x,m,i,tmp,max,best) { cols = L[w,M]; j = Colval[w,cols,0]; for(i=1;i<=j;i++) klass=Colval[w,cols,i]; for(i in klasses) if (i > 0) { tmp= Colvals[w,m,klasses[i]]; if (tmp> max) {max=tmp; x=i}}; best=klasses[x]; cut(x,klasses); return best; } function popHardestRuleCondition(w, rule, val,new,i,x,min) { min = Inf; for(i in rule) { val = rule[i]; new = Colvals[w,i,val]; if ( new < min ) { min = new; x = i } } delete rule[x]; return x; } #} #

Misc stuff

#{ function show(w,rule, str,col,sep,what) { for(col in rule) { what = ((w,col) in Names) ? Names[w,col] : col; str = str sep what "=" rule[col]; sep = " and "; }; return str } function prints() { saya("L",L); saya("Symbol", Symbol); # maps symbols to an integer saya("Lobmys", Lobmys); # maps integer to symbols saya("Xs", Xs); # index of rows storing values saya("E", E) ; # list of seen attribute value pairs saya("Names", Names) ; # list of column names saya("Colvals", Colvals); # frequency of column/value pairs saya("Colval", Colval); # unique column values } function zaps(wid) { zap(wid, L) ; # table of globals zap(wid, Symbol); # maps symbols to an integer zap(wid, Lobmys); # maps integer to symbols zap(wid, Xs); # index of rows storing values zap(wid, E) ; # list of seen attribute value pairs zap(wid, Names) ; # list of column names zap(wid, Colvals); # frequency of column/value pairs zap(wid, Colval); # unique column values } function zap(doomed,a, i) { doomed= want(doomed); for(i in a) if (i ~ doomed) delete a[i]; } function want(w) { return "^" w SUBSEP ".*[^0]$" } function push(x,a) { a[++a[0]]=x } function cut(x,a, y) { y=a[a[0]]; a[x]=y; delete a[a[0]--] }; function globals() { N = -1 M = -2 Inf = 10 ^ 32 NegInf = -1 * Inf Ee = 8.48456e+08 / 3.1213e+08 Pi = 4.28225e+14 / 1.36308e+14 Sp = " " Q = "\"" _ = SUBSEP White = "^[ \t\n\r]*$" Number = "^[+-]?([0-9]+[.]?[0-9]*|[.][0-9]+)([eE][+-]?[0-9]+)?$" } function push2(w,x,a) { a[w,++a[w,0]]=x } #function pop2(w,a, empty,n,top) { # n = a[w,0]; if(!n) {return empty} else {top=a[w,n]; delete a[w,n]; a[w,0]--; return top} #} function pop(a, empty, n,top) { n = a[0]; if(!n) {return empty} else {top=a[n]; delete a[n]; A[0]--; return top} } #} #

Library utilities