#!/usr/bin/gawk -f
#
PRISM
#A "seperate and conqueor" rule generation algorithm.
#
This file should be compiled using "am" (awk-with-macros).
#
#Download from: ../src/prism
#Browse source: ../src/prism;
#See examples: ../eg/prism;
#View executable: ../bin/prism; (warning- verbose)
#
#% cat data
#age,spectaclePrescription,astigmatism,tearproductionRate,recommendedLenses
#young,myope,no,reduced,none
#young,myope,no,normal,soft
#young,myope,yes,reduced,none
#young,myope,yes,normal,hard
#young,hypermetrope,no,reduced,none
#young,hypermetrope,no,normal,soft
#young,hypermetrope,yes,reduced,none
#young,hypermetrope,yes,normal,hard
#pre-presbyopic,myope,no,reduced,none
#pre-presbyopic,myope,no,normal,soft
#pre-presbyopic,myope,yes,reduced,none
#pre-presbyopic,myope,yes,normal,hard
#pre-presbyopic,hypermetrope,no,reduced,none
#pre-presbyopic,hypermetrope,no,normal,soft
#pre-presbyopic,hypermetrope,yes,reduced,none
#pre-presbyopic,hypermetrope,yes,normal,none
#presbyopic,myope,no,reduced,none
#presbyopic,myope,no,normal,none
#presbyopic,myope,yes,reduced,none
#presbyopic,myope,yes,normal,hard
#presbyopic,hypermetrope,no,reduced,none
#presbyopic,hypermetrope,no,normal,soft
#presbyopic,hypermetrope,yes,reduced,none
#presbyopic,hypermetrope,yes,normal,none
#% prism Csv=1 data
#IF tearproductionRate=reduced THEN none
#IF tearproductionRate=normal and age=presbyopic and spectaclePrescription=hypermetrope and astigmatism=no THEN soft
#IF tearproductionRate=normal and spectaclePrescription=myope and astigmatism=yes THEN hard
#IF tearproductionRate=normal and age=pre-presbyopic and spectaclePrescription=hypermetrope and astigmatism=no THEN soft
#IF tearproductionRate=normal and age=presbyopic and spectaclePrescription=hypermetrope and astigmatism=yes THEN none
#IF tearproductionRate=normal and age=young and spectaclePrescription=hypermetrope and astigmatism=no THEN soft
#IF tearproductionRate=normal and age=pre-presbyopic and spectaclePrescription=hypermetrope and astigmatism=yes THEN none
#IF tearproductionRate=normal and age=pre-presbyopic and spectaclePrescription=myope and astigmatism=no THEN soft
#IF tearproductionRate=normal and age=presbyopic and spectaclePrescription=myope and astigmatism=no THEN none
#IF tearproductionRate=normal and age=young and spectaclePrescription=hypermetrope and astigmatism=yes THEN hard
#IF tearproductionRate=normal THEN soft
###
Preamble
#Includes
#{
# [globals.awk] added to end of files
# [list/array.awk] added to end of files
# [list/copy.awk] added to end of files
# [list/keys.awk] added to end of files
# [string/saya.awk] added to end of files
#} Working Memory
#{
function emptyI(names, classes, e, xs, symbol, lobmys, colvals, colval, g ) {
array(g) ; # table of globals
g[M] = 0; # number of columns
g[N] = 0; # number of rows
array(symbol); # maps symbols to an integer
array(lobmys); # maps integer to symbols
array(classes); # list of claseen counts
array(xs); # index of rows storing values
array(e) ; # list of seen attribute value pairs
array(names) ; # list of column names
array(colvals); # frequency of column/value pairs
array(colval); # unique column values
}
function printI(names, classes, e, xs, symbol, lobmys, colvals, colval, g ) {
print "N " g[N] " M " g[M];
saya("e", e); saya("xs", xs);
saya("names", names); saya("classes", classes);
saya("symbol", symbol); saya("lobm#ys", lobmys);
saya("colvals", colvals); saya("colval", colval);
}
#} Main
#{
BEGIN {
globals();
prism();
}
function prism( names, classes, e, xs, symbol, lobmys, colvals, colval, g ) {
emptyI(names, classes, e, xs, symbol, lobmys, colvals, colval, g );
read(names, classes, e, xs, symbol, lobmys, colvals, colval, g );
#print 3; saya("names",names); print 4;
if (Debug && Debug==1) {printI(names, classes, e, xs, symbol, lobmys, colvals, colval, g ); exit}
if (Debug && Debug==2) printI(names, classes, e, xs, symbol, lobmys, colvals, colval, g );
rules(names, classes, e, xs, symbol, lobmys, colvals, colval, g );
}
function read(names, classes, e, xs, symbol, lobmys, colvals, colval, g , m,n,i,lines) {
FS=",";
while (getline>0) {
sub( /\%.*/,"" );
if ( /^[ \t]*$/ ) continue;
if ( /@relation/ ) FS=" ";
if ( /@attribute/ ) names[++m]= $2;
if ( /@data/ ) FS=",";
if ( /@/ ) continue;
if (Csv && ++line==1 ) {
for(i=1;i<=NF;i++)
names[i]=$i;
} else {
n++;
for(i=1;i<=NF;i++)
update(names, classes, e, xs, symbol, lobmys, colvals, colval, g ,NF,n,i,$i) }}
}
function rules(names0,classes0,e0,xs0,symbol0,lobmys0,colvals0,colval0,g0, names, classes, e, xs, symbol, lobmys, colvals, colval, g ,class,rule,covered) {
if ( (class = popMostFrequentClass(classes0)) ) {
rules1(names0,classes0,e0,xs0,symbol0,lobmys0,colvals0,colval0,g0, class,rule,covered);
print "IF " show(names0,classes0,e0,xs0,symbol0,lobmys0,colvals0,colval0,g0,rule) " THEN " class;
cull(names0,classes0,e0,xs0,symbol0,lobmys0,colvals0,colval0,g0,names, classes, e, xs, symbol, lobmys, colvals, colval, g ,covered);
rules(names, classes, e, xs, symbol, lobmys, colvals, colval, g ) }
}
function rules1(names, classes, e, xs, symbol, lobmys, colvals, colval, g ,class,rule,covered, col,as,score,perfect) {
attrs(names, classes, e, xs, symbol, lobmys, colvals, colval, g ,as);
while( col = popNextAttr(as) ) {
perfect = addBestValue(names, classes, e, xs, symbol, lobmys, colvals, colval, g ,class,rule,covered,col);
if (perfect) return;
}
}
function addBestValue(names, classes, e, xs, symbol, lobmys, colvals, colval, g ,class,rule,covered,col, pt,vs,maxs,maxp, val,tmps,covered0,best) {
vals(names, classes, e, xs, symbol, lobmys, colvals, colval, g ,col,vs);
maxs = -1;
maxp = -1
while (val = popNextVal(col,vs)) {
rule[col] = val;
tmps = score(names, classes, e, xs, symbol, lobmys, colvals, colval, g , class,rule,covered0,pt);
if (tmps > maxs) {copy(covered0,covered); maxs=tmps; best=val; }
if (tmps==maxs && ps["p"]>maxp ) {copy(covered0,covered); maxs=tmps; best=val; maxp=ps["p"]; }
}
rule[col]=best;
return tmps == 1;
}
#} Working Memory Management
#{
function update(names, classes, e, xs, symbol, lobmys, colvals, colval, g , cols,row,col,val) {
#val = sym(XI,val); # comment out when degugging
if (col > g[M]) g[M]= col;
if (cols > g[M]) g[M]= cols;
if (row > g[N]) g[N]= row;
e[row,col] = val;
xs[val,++xs[val,0]] = row;
if (col==cols)
classes[val]++;
if (++colvals[col,val] == 1)
colval[col,++colval[col,0]]=val
}
function sym(names, classes, e, xs, symbol, lobmys, colvals, colval, g ,val) {
if ( ! (val in symbol) ) {
lobmys[++lobmys[0]] = val; # define new symbol number
symbol[val] = lobmys[0]; # map symbol to new number
};
return symbol[val];
}
function cull(names0,classes0,e0,xs0,symbol0,lobmys0,colvals0,colval0,g0,names, classes, e, xs, symbol, lobmys, colvals, colval, g ,covered,keep, rows,cols,col,i,n) {
emptyI(names, classes, e, xs, symbol, lobmys, colvals, colval, g );
rows = g0[N];
cols = g0[M];
copy(names0,names);
for(i=1;i<=rows;i++)
if (! covered[i]) {
n++;
for(col=1;col<=cols;col++)
update(names, classes, e, xs, symbol, lobmys, colvals, colval, g ,cols,n,col,e0[i,col])}
#print "kept " n " culled " rows - n;
if (! keep)
emptyI(names0,classes0,e0,xs0,symbol0,lobmys0,colvals0,colval0,g0); # free up some memory
}
#} Score one rule
#{
function score(names, classes, e, xs, symbol, lobmys, colvals, colval, g ,target,rule,covered,pt, \
copyOfRule,ks,col,p,t,vl,steps,row,row1,max,val,i,m) {
array(covered);
copy(rule,copyOfRule);
m=g[M];
#saya("rule",rule);
while ( col = popHardestRuleCondition(names, classes, e, xs, symbol, lobmys, colvals, colval, g , copyOfRule) ) {
p = t = 0;
val = rule[col];
if (steps++) {
for( row in covered )
if ( e[row,col] != val ) {
delete covered[row];
} else {
t++;
p += e[row,m]==target }#
} else {
max=xs[val,0];
for(i=1;i<=max;i++) {
row1=xs[val,i];
covered[row1]=1;
t++;
p += e[row1,m] == target }}
}
pt["p"]=p; pt["t"]=t;
return t ? p/t : 0;
}
#}
# Search control
# Find all
#{
function attrs(names, classes, e, xs, symbol, lobmys, colvals, colval, g ,out, i,j) {
j=g[M];
for(i=1;iFind next best
#{
function entropyAttrSelection (ent) {
#return ent[N] > 0 ? ent[ent[-1]--] : 0 ;
}
function popNextAttr(as) { # any attribute will do
return as[N] > 0 ? as[as[-1]--] : 0;
}
function popNextVal(a,vs) { # any attribute value will do
return vs[N] > 0 ? vs[vs[-1]--] : 0;
}
function popMostFrequentClass(classes, i,x,most) {
for(i in classes)
if (classes[i] > most)
most = classes[x=i];
delete classes[x];
return x;
}
function popHardestRuleCondition(names, classes, e, xs, symbol, lobmys, colvals, colval, g , rule, val,new,i,x,min) {
min = Inf;
for(i in rule) {
val = rule[i];
new = colvals[i,val];
if ( new < min ) {
min = new;
x = i }
}
delete rule[x];
return x;
}
#}
# Misc stuff
#{
function show(names, classes, e, xs, symbol, lobmys, colvals, colval, g ,rule, str,col,sep,what) {
# print 1
#saya("names4Tim",names)
# print 2
for(col in rule) {
what = (col in names) ? names[col] : col;
str = str sep what "=" rule[col];
sep = " and ";
};
return str
}
#}
#Library utilities
#globals.awk
#Create some handy globals.
#{
function globals() {
# magic array positions. should all be < 0 (so index==0 is an error)
N = -1; # height of stack (number of rows);
M = -2; # breadth of array (number of colums);
# numeric constants
Inf = 10^32;
NegInf = -1 * Inf;
Ee = 848456353 / 312129649;
Pi = 428224593349304 / 136308121570117; # good to 29 digits
}
#}
#array.awk
# Ensures an array is empty
#{
function array(a) { split("",a,"") }
#}
# copy.awk
# Empty the second argument, then fill it up only with the first.
#{
function copy(a,b, i) {
array(b);
for(i in a)
b[i]=a[i];
}
#}
# keys.awk
# Empty the second argument, then fill it up only with the keys from the first.
#{
function keys(a,k, i) {
array(k);
for(i in a)
if(i > 0)
k[++k[-1]]=i;
}
#}
#saya.awk
#Print a string in key-sort order.
#{
function saya(s,a,q1,q2,eol, i,j,n,tmp,str,sep) {
q1= q1 ? "\"" : "";
q2= q2 ? "\"" : "";
for(i in a) {
sep="";
str= s"[";
n=split(i,tmp,SUBSEP);
for(j=1;j<=n;j++) {
str=str sep q1 tmp[j] q1;
sep=",";
}
print str "] = " q2 a[i] q2 eol | "sort";
};
close("sort")
}
#}
#funtion