#!/usr/bin/gawk -f #

PRISM

#

A "seperate and conqueor" rule generation algorithm. #

This file should be compiled using "am" (awk-with-macros). #

#Download from: ../src/prism
#Browse source: ../src/prism;
#See examples: ../eg/prism;
#View executable: ../bin/prism; (warning- verbose)
# #% cat data #age,spectaclePrescription,astigmatism,tearproductionRate,recommendedLenses #young,myope,no,reduced,none #young,myope,no,normal,soft #young,myope,yes,reduced,none #young,myope,yes,normal,hard #young,hypermetrope,no,reduced,none #young,hypermetrope,no,normal,soft #young,hypermetrope,yes,reduced,none #young,hypermetrope,yes,normal,hard #pre-presbyopic,myope,no,reduced,none #pre-presbyopic,myope,no,normal,soft #pre-presbyopic,myope,yes,reduced,none #pre-presbyopic,myope,yes,normal,hard #pre-presbyopic,hypermetrope,no,reduced,none #pre-presbyopic,hypermetrope,no,normal,soft #pre-presbyopic,hypermetrope,yes,reduced,none #pre-presbyopic,hypermetrope,yes,normal,none #presbyopic,myope,no,reduced,none #presbyopic,myope,no,normal,none #presbyopic,myope,yes,reduced,none #presbyopic,myope,yes,normal,hard #presbyopic,hypermetrope,no,reduced,none #presbyopic,hypermetrope,no,normal,soft #presbyopic,hypermetrope,yes,reduced,none #presbyopic,hypermetrope,yes,normal,none #% prism Csv=1 data #IF tearproductionRate=reduced THEN none #IF tearproductionRate=normal and age=presbyopic and spectaclePrescription=hypermetrope and astigmatism=no THEN soft #IF tearproductionRate=normal and spectaclePrescription=myope and astigmatism=yes THEN hard #IF tearproductionRate=normal and age=pre-presbyopic and spectaclePrescription=hypermetrope and astigmatism=no THEN soft #IF tearproductionRate=normal and age=presbyopic and spectaclePrescription=hypermetrope and astigmatism=yes THEN none #IF tearproductionRate=normal and age=young and spectaclePrescription=hypermetrope and astigmatism=no THEN soft #IF tearproductionRate=normal and age=pre-presbyopic and spectaclePrescription=hypermetrope and astigmatism=yes THEN none #IF tearproductionRate=normal and age=pre-presbyopic and spectaclePrescription=myope and astigmatism=no THEN soft #IF tearproductionRate=normal and age=presbyopic and spectaclePrescription=myope and astigmatism=no THEN none #IF tearproductionRate=normal and age=young and spectaclePrescription=hypermetrope and astigmatism=yes THEN hard #IF tearproductionRate=normal THEN soft ###

Preamble

#

Includes

#{ # [globals.awk] added to end of files # [list/array.awk] added to end of files # [list/copy.awk] added to end of files # [list/keys.awk] added to end of files # [string/saya.awk] added to end of files #}

Working Memory

#{ function emptyI(names, classes, e, xs, symbol, lobmys, colvals, colval, g ) { array(g) ; # table of globals g[M] = 0; # number of columns g[N] = 0; # number of rows array(symbol); # maps symbols to an integer array(lobmys); # maps integer to symbols array(classes); # list of claseen counts array(xs); # index of rows storing values array(e) ; # list of seen attribute value pairs array(names) ; # list of column names array(colvals); # frequency of column/value pairs array(colval); # unique column values } function printI(names, classes, e, xs, symbol, lobmys, colvals, colval, g ) { print "N " g[N] " M " g[M]; saya("e", e); saya("xs", xs); saya("names", names); saya("classes", classes); saya("symbol", symbol); saya("lobm#ys", lobmys); saya("colvals", colvals); saya("colval", colval); } #}

Main

#{ BEGIN { globals(); prism(); } function prism( names, classes, e, xs, symbol, lobmys, colvals, colval, g ) { emptyI(names, classes, e, xs, symbol, lobmys, colvals, colval, g ); read(names, classes, e, xs, symbol, lobmys, colvals, colval, g ); #print 3; saya("names",names); print 4; if (Debug && Debug==1) {printI(names, classes, e, xs, symbol, lobmys, colvals, colval, g ); exit} if (Debug && Debug==2) printI(names, classes, e, xs, symbol, lobmys, colvals, colval, g ); rules(names, classes, e, xs, symbol, lobmys, colvals, colval, g ); } function read(names, classes, e, xs, symbol, lobmys, colvals, colval, g , m,n,i,lines) { FS=","; while (getline>0) { sub( /\%.*/,"" ); if ( /^[ \t]*$/ ) continue; if ( /@relation/ ) FS=" "; if ( /@attribute/ ) names[++m]= $2; if ( /@data/ ) FS=","; if ( /@/ ) continue; if (Csv && ++line==1 ) { for(i=1;i<=NF;i++) names[i]=$i; } else { n++; for(i=1;i<=NF;i++) update(names, classes, e, xs, symbol, lobmys, colvals, colval, g ,NF,n,i,$i) }} } function rules(names0,classes0,e0,xs0,symbol0,lobmys0,colvals0,colval0,g0, names, classes, e, xs, symbol, lobmys, colvals, colval, g ,class,rule,covered) { if ( (class = popMostFrequentClass(classes0)) ) { rules1(names0,classes0,e0,xs0,symbol0,lobmys0,colvals0,colval0,g0, class,rule,covered); print "IF " show(names0,classes0,e0,xs0,symbol0,lobmys0,colvals0,colval0,g0,rule) " THEN " class; cull(names0,classes0,e0,xs0,symbol0,lobmys0,colvals0,colval0,g0,names, classes, e, xs, symbol, lobmys, colvals, colval, g ,covered); rules(names, classes, e, xs, symbol, lobmys, colvals, colval, g ) } } function rules1(names, classes, e, xs, symbol, lobmys, colvals, colval, g ,class,rule,covered, col,as,score,perfect) { attrs(names, classes, e, xs, symbol, lobmys, colvals, colval, g ,as); while( col = popNextAttr(as) ) { perfect = addBestValue(names, classes, e, xs, symbol, lobmys, colvals, colval, g ,class,rule,covered,col); if (perfect) return; } } function addBestValue(names, classes, e, xs, symbol, lobmys, colvals, colval, g ,class,rule,covered,col, pt,vs,maxs,maxp, val,tmps,covered0,best) { vals(names, classes, e, xs, symbol, lobmys, colvals, colval, g ,col,vs); maxs = -1; maxp = -1 while (val = popNextVal(col,vs)) { rule[col] = val; tmps = score(names, classes, e, xs, symbol, lobmys, colvals, colval, g , class,rule,covered0,pt); if (tmps > maxs) {copy(covered0,covered); maxs=tmps; best=val; } if (tmps==maxs && ps["p"]>maxp ) {copy(covered0,covered); maxs=tmps; best=val; maxp=ps["p"]; } } rule[col]=best; return tmps == 1; } #}

Working Memory Management

#{ function update(names, classes, e, xs, symbol, lobmys, colvals, colval, g , cols,row,col,val) { #val = sym(XI,val); # comment out when degugging if (col > g[M]) g[M]= col; if (cols > g[M]) g[M]= cols; if (row > g[N]) g[N]= row; e[row,col] = val; xs[val,++xs[val,0]] = row; if (col==cols) classes[val]++; if (++colvals[col,val] == 1) colval[col,++colval[col,0]]=val } function sym(names, classes, e, xs, symbol, lobmys, colvals, colval, g ,val) { if ( ! (val in symbol) ) { lobmys[++lobmys[0]] = val; # define new symbol number symbol[val] = lobmys[0]; # map symbol to new number }; return symbol[val]; } function cull(names0,classes0,e0,xs0,symbol0,lobmys0,colvals0,colval0,g0,names, classes, e, xs, symbol, lobmys, colvals, colval, g ,covered,keep, rows,cols,col,i,n) { emptyI(names, classes, e, xs, symbol, lobmys, colvals, colval, g ); rows = g0[N]; cols = g0[M]; copy(names0,names); for(i=1;i<=rows;i++) if (! covered[i]) { n++; for(col=1;col<=cols;col++) update(names, classes, e, xs, symbol, lobmys, colvals, colval, g ,cols,n,col,e0[i,col])} #print "kept " n " culled " rows - n; if (! keep) emptyI(names0,classes0,e0,xs0,symbol0,lobmys0,colvals0,colval0,g0); # free up some memory } #}

Score one rule

#{ function score(names, classes, e, xs, symbol, lobmys, colvals, colval, g ,target,rule,covered,pt, \ copyOfRule,ks,col,p,t,vl,steps,row,row1,max,val,i,m) { array(covered); copy(rule,copyOfRule); m=g[M]; #saya("rule",rule); while ( col = popHardestRuleCondition(names, classes, e, xs, symbol, lobmys, colvals, colval, g , copyOfRule) ) { p = t = 0; val = rule[col]; if (steps++) { for( row in covered ) if ( e[row,col] != val ) { delete covered[row]; } else { t++; p += e[row,m]==target }# } else { max=xs[val,0]; for(i=1;i<=max;i++) { row1=xs[val,i]; covered[row1]=1; t++; p += e[row1,m] == target }} } pt["p"]=p; pt["t"]=t; return t ? p/t : 0; } #} #

Search control

#

Find all

#{ function attrs(names, classes, e, xs, symbol, lobmys, colvals, colval, g ,out, i,j) { j=g[M]; for(i=1;iFind next best #{ function entropyAttrSelection (ent) { #return ent[N] > 0 ? ent[ent[-1]--] : 0 ; } function popNextAttr(as) { # any attribute will do return as[N] > 0 ? as[as[-1]--] : 0; } function popNextVal(a,vs) { # any attribute value will do return vs[N] > 0 ? vs[vs[-1]--] : 0; } function popMostFrequentClass(classes, i,x,most) { for(i in classes) if (classes[i] > most) most = classes[x=i]; delete classes[x]; return x; } function popHardestRuleCondition(names, classes, e, xs, symbol, lobmys, colvals, colval, g , rule, val,new,i,x,min) { min = Inf; for(i in rule) { val = rule[i]; new = colvals[i,val]; if ( new < min ) { min = new; x = i } } delete rule[x]; return x; } #} #

Misc stuff

#{ function show(names, classes, e, xs, symbol, lobmys, colvals, colval, g ,rule, str,col,sep,what) { # print 1 #saya("names4Tim",names) # print 2 for(col in rule) { what = (col in names) ? names[col] : col; str = str sep what "=" rule[col]; sep = " and "; }; return str } #} #

Library utilities

#

globals.awk

#Create some handy globals. #{ function globals() { # magic array positions. should all be < 0 (so index==0 is an error) N = -1; # height of stack (number of rows); M = -2; # breadth of array (number of colums); # numeric constants Inf = 10^32; NegInf = -1 * Inf; Ee = 848456353 / 312129649; Pi = 428224593349304 / 136308121570117; # good to 29 digits } #} #

array.awk

# Ensures an array is empty #{ function array(a) { split("",a,"") } #} #

copy.awk

# Empty the second argument, then fill it up only with the first. #{ function copy(a,b, i) { array(b); for(i in a) b[i]=a[i]; } #} #

keys.awk

# Empty the second argument, then fill it up only with the keys from the first. #{ function keys(a,k, i) { array(k); for(i in a) if(i > 0) k[++k[-1]]=i; } #} #

saya.awk

#Print a string in key-sort order. #{ function saya(s,a,q1,q2,eol, i,j,n,tmp,str,sep) { q1= q1 ? "\"" : ""; q2= q2 ? "\"" : ""; for(i in a) { sep=""; str= s"["; n=split(i,tmp,SUBSEP); for(j=1;j<=n;j++) { str=str sep q1 tmp[j] q1; sep=","; } print str "] = " q2 a[i] q2 eol | "sort"; }; close("sort") } #} #funtion