#!/usr/bin/gawk -f
#
PRISM2
#A "seperate and conqueor" rule generation algorithm.
#
This file should be compiled using "am" (awk-with-macros).
#
#Download from: ../src/prism
#Browse source: ../src/prism;
#See examples: ../eg/prism;
#View executable: ../bin/prism; (warning- verbose)
#
Preamble
#Includes
#{
# replace these with the relevant functions
#USES(globals)
#USES(list/array)
#USES(list/copy)
#USES(string/saya)
#} Main
#{
BEGIN {
FS=",";
globals();
array(L) ; # table of local variables for this working memory
array(Symbol); # maps symbols to an integer
array(Lobmys); # maps integer to symbols
array(Xs); # index of rows storing values
array(E) ; # list of seen attribute value pairs
array(Names) ; # list of column names
array(Colvals); # frequency of column/value pairs
array(Colval); # unique column values
Wid = 1; # working memory id
}
{ sub( /\%.*/,"") };
/^[ \t]*$/ { next }
/@relation/ { FS=" "};
/@attribute/ { push2(Wid,$2,Names); };
/@data/ { FS="," }
/@/ { next }
Csv && ++Line==1 { for(I=1;I<=NF;I++) {
push2(Wid,$I,Names) }
next;
}
{ Records++;
print ">=== " Records;
for(I=1;I<=NF;I++)
update(Wid,NF,Records,I,$I);
}
END { rules(Wid) }
function rules(w0, klasses,class,rule,covered,w) {
classes(w0,klasses);
class = popNextClass(w,klasses);
print "top " class;
if (! class) return;
rules1(w0, class,rule,covered);
print "IF " show(w0,rule) " THEN " class;
w=cull(w0,covered);
rules(w);
}
function rules1(w,class,rule,covered, col,as,score,perfect) {
attrs(w,as);
while( col = popNextAttr(as) ) {
perfect = addBestValue(w,class,rule,covered,col);
if (perfect) return;
}
}
function addBestValue(w,class,rule,covered,col,
pt,vs,maxs,maxpval,tmps,covered0,best) {
vals(w,col,vs);
maxs = -1;
maxp = -1
while (val = popNextVal(col,vs)) {
rule[col] = val;
tmps = score(w, class,rule,covered0,pt);
if (tmps > maxs) {copy(covered0,covered);
maxs=tmps; best=val; }
if (tmps==maxs && ps["p"]>maxp ) {copy(covered0,covered);
maxs=tmps; best=val; maxp=ps["p"]; }
};
rule[col]=best;
return tmps == 1;
}
#} Score one rule
#{
function score(w,target,rule,covered,pt, \
copyOfRule,ks,col,p,t,vl,steps,row,row1,max,val,i,m) {
array(covered);
copy(rule,copyOfRule);
m=L[w,M];
#saya("rule",rule);
while ( col = popHardestRuleCondition(w, copyOfRule) ) {
p = t = 0;
val = rule[col];
if (steps++) {
for( row in covered )
if ( E[w,row,col] != val ) {
delete covered[row];
} else {
t++;
p += E[w,row,m]==target }
} else {
max=Xs[w,val,0];
for(i=1;i<=max;i++) {
row1=Xs[w,val,i];
covered[row1]=1;
t++;
p += E[w,row1,m] == target }}
}
pt["p"]=p; pt["t"]=t;
return t ? p/t : 0;
}
#}
#} Working Memory Management
#{
function update(w, cols,row,col,val) {
#val = sym(XI,val); # comment out when degugging
if (col > L[w,M]) L[w,M]= col;
if (cols > L[w,M]) L[w,M]= cols;
if (row > L[w,N]) L[w,N]= row;
E[w,row,col] = val;
Xs[w,val,++Xs[w,val,0]] = row;
if (++Colvals[w,col,val] == 1)
Colval[w,col,++Colval[w,col,0]]=val
}
function sym(w,val) {
if ( ! ((w,val) in Symbol) ) {
Lobmys[w,++Lobmys[w,0]] = val; # define new symbol number
Symbol[w,val] = Lobmys[w,0]; # map symbol to new number
};
return Symbol[w,val];
}
function cull(w0,covered,keep, rows,cols,col,i,n,w,max) {
w= ++Wid;
max=Names[w,0]=Names[w0,0];
for(i=1;i<=max;i++)
Names[w,i]=Names[w0,i];
rows = L[w0,N];
cols = L[w0,M];
for(i=1;i<=rows;i++)
if (! covered[i]) {
n++;
for(col=1;col<=cols;col++)
update(w,cols,n,col,E[w0,i,col])}
if (! keep) zaps(w0); # maybe, free some memory
return w
}
# Search control
# Find all
#{
function vals(w,col,out, i,j,k) {
j=Colval[w,col,0];
for(i=1;i<=j;i++) {
k=Colval[w,col,i];
push(k,out)};
}
function classes(w,out, m) { vals(w,L[w,M],out) }
function array(a)
{
split("", a, "")
}
function attrs(w,out, i,m) { m=L[w,M]; for(i=1;iFind next best
#{
function popNextVal(a,vs) { return pop(vs) }
function popNextAttr(as) { return pop(as) }
function popNextClass(w,klasses, x,m,i,tmp,max,best) {
cols = L[w,M];
j = Colval[w,cols,0];
for(i=1;i<=j;i++)
klass=Colval[w,cols,i];
for(i in klasses)
if (i > 0) {
tmp= Colvals[w,m,klasses[i]];
if (tmp> max) {max=tmp; x=i}};
best=klasses[x];
cut(x,klasses);
return best;
}
function popHardestRuleCondition(w, rule, val,new,i,x,min) {
min = Inf;
for(i in rule) {
val = rule[i];
new = Colvals[w,i,val];
if ( new < min ) {
min = new;
x = i }
}
delete rule[x];
return x;
}
#}
# Misc stuff
#{
function show(w,rule, str,col,sep,what) {
for(col in rule) {
what = ((w,col) in Names) ? Names[w,col] : col;
str = str sep what "=" rule[col];
sep = " and ";
};
return str
}
function prints() {
saya("L",L);
saya("Symbol", Symbol); # maps symbols to an integer
saya("Lobmys", Lobmys); # maps integer to symbols
saya("Xs", Xs); # index of rows storing values
saya("E", E) ; # list of seen attribute value pairs
saya("Names", Names) ; # list of column names
saya("Colvals", Colvals); # frequency of column/value pairs
saya("Colval", Colval); # unique column values
}
function zaps(wid) {
zap(wid, L) ; # table of globals
zap(wid, Symbol); # maps symbols to an integer
zap(wid, Lobmys); # maps integer to symbols
zap(wid, Xs); # index of rows storing values
zap(wid, E) ; # list of seen attribute value pairs
zap(wid, Names) ; # list of column names
zap(wid, Colvals); # frequency of column/value pairs
zap(wid, Colval); # unique column values
}
function zap(doomed,a, i) {
doomed= want(doomed);
for(i in a)
if (i ~ doomed)
delete a[i];
}
function want(w) { return "^" w SUBSEP ".*[^0]$" }
function push(x,a) { a[++a[0]]=x }
function cut(x,a, y) { y=a[a[0]]; a[x]=y; delete a[a[0]--] };
function globals()
{
N = -1
M = -2
Inf = 10 ^ 32
NegInf = -1 * Inf
Ee = 8.48456e+08 / 3.1213e+08
Pi = 4.28225e+14 / 1.36308e+14
Sp = " "
Q = "\""
_ = SUBSEP
White = "^[ \t\n\r]*$"
Number = "^[+-]?([0-9]+[.]?[0-9]*|[.][0-9]+)([eE][+-]?[0-9]+)?$"
}
function push2(w,x,a) { a[w,++a[w,0]]=x }
#function pop2(w,a, empty,n,top) {
# n = a[w,0]; if(!n) {return empty} else {top=a[w,n]; delete a[w,n]; a[w,0]--; return top}
#}
function pop(a, empty, n,top) {
n = a[0]; if(!n) {return empty} else {top=a[n]; delete a[n];
A[0]--; return top}
}
#}
#Library utilities