# /* vim: set filetype=sh : */
### stop reading. broken after this usage: bash our minerc
# warning: requires at least 5MB of free disk
##########################################################################
# ourmine : a simple learning environment for data mining
# Copyright (C) 2007,2008, Tim Menzies (tim@menzies.us, http://menzies.us),
# Gregory Gay (gregoryg@csee.wvu.edu)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
##########################################################################
# for more info on command-line weka stuff, see
# http://www.cs.waikato.ac.nz/~remco/weka_bn/node13.html
# Directory that was current when this file was loaded; reload() and the
# demos cd back to it.
Here=$(pwd)
#### generic stuff
# Return to $Here and re-source the rc file named by $Ourrc.
# Globals: Here (read), Ourrc (read; not assigned in the visible part of
#          this file, so it must be set by the caller).
# Returns: non-zero if the directory change fails (nothing is sourced
#          then), otherwise the status of sourcing $Ourrc.
reload() {
  cd "$Here" || return
  . "$Ourrc"
}
# Print the definition of the shell variable(s) or function called $1.
# If `set` lists an assignment whose name starts with $1, every line
# starting with $1 is printed; otherwise the function text from
# "$1 ()" down to the closing "}" is printed.
# Arguments: $1 - name (prefix) to look up
# Outputs:   the matching definition on stdout
show() {
  local pattern="^$1"
  local printFunction="/^$1 /,/^}/{print}"
  # grep -q answers the question without leaving scratch output in /tmp.
  if set | grep -- "$pattern" | grep -q "="; then
    set | grep -- "$pattern"
  else
    set | gawk "$printFunction"
  fi
}
# Write the arguments (joined by spaces) to stderr without a newline.
# Backslash escapes such as \n are still expanded, but the text is no
# longer used as the printf format, so a literal % is printed as-is.
blab() { printf '%b' "$*" >&2; }
# Like blab, but ends the message with a newline. The text is passed as
# data (escapes such as \n expanded), never as the printf format.
blabln() { printf '%b\n' "$*" >&2; }
#### initialization stuff
# One-shot initialisation: define the session variables, then create the
# directories that are named by them.
setup() { setUpVars; setUpDirs; }
# Alias ls to the "--color" spelling of coloured output.
lcsee() {
  alias ls='ls --color'
}
# Put $Here/minerc.lib at the front of the exported AWKPATH.
build() {
  AWKPATH="$Here/minerc.lib:$AWKPATH"
  export AWKPATH
}
# Define the session-wide settings used by the rest of this file.
# Globals written: PROMPT_COMMAND, PS1, Ourmine, PATH, Safe, Dirs, LibUrl,
#                  AWKPATH (exported), Bins, Repeats, Learners, Data, Audit.
# Also aliases ls to "ls -G".
setUpVars() {
  alias ls="ls -G"
  PROMPT_COMMAND='echo -ne "\033]0;${HOSTNAME}: `pwd`\007"'
  PS1="Our MINE!: \!$ "
  Ourmine="$HOME/opt/ourmine"
  PATH="$Ourmine/bin:$HOME/bin:$PATH"
  Safe=$Ourmine/var/safe
  # Was "$Our/lib/arffs"; $Our is never defined, which left Dirs pointing
  # at /lib/arffs instead of the ourmine library.
  Dirs="$Ourmine/lib/arffs"
  LibUrl="http://unbox.org/wisp/trunk/our/minerc.lib/lib.zip"
  export AWKPATH="$Ourmine/lib:$AWKPATH"
  Bins=10
  Repeats=2
  Learners="nb nbk"
  # NOTE(review): these globs use lib/arff/ while the demos read
  # lib/arffs/ -- confirm which directory name is right.
  Data="$Ourmine/lib/arff/uci/discrete/a*.arff
$Ourmine/lib/arff/uci/discrete/s*.arff"
  Audit="pgawk --profile=$HOME/tmp/awkprof.out --dump-variables=$HOME/tmp/awkvars.out --lint"
}
# Create the working directories and stage weka.jar.
# Globals read:    HOME, USER, Ourmine, Safe
# Globals written: Tmp (fresh mktemp directory under /tmp/$USER), Weka,
#                  Rdr, CompiledRDR
# Calls downloads when $Ourmine/lib/lib.zip is absent.
setUpDirs() {
  mkdir -p $HOME/tmp /tmp/$USER
  Tmp=$(mktemp -d -p /tmp/$USER)
  Weka="nice -19 java -Xmx1024M -cp $Tmp/weka.jar "
  Rdr="/home/bryanl/svns/my-wisp/lib"
  CompiledRDR="$Tmp/rdr"
  #RdrDir="/home/bryanl/svns/my-wisp/lib/rdr-wisp/"
  # In order: scratch space, support code, our executables, your
  # executables, stuff you want to keep around.
  mkdir -p $Tmp $Ourmine/lib $Ourmine/bin $HOME/bin $Safe
  [ -f "$Ourmine/lib/lib.zip" ] || downloads
  # I had too much trouble with pathname syntax problems
  # on mac, windows, linux, etc. So now I just copy weka.jar
  # to the working directory (no need for pathnames)
  cp $Ourmine/lib/weka.jar $Tmp
  #sbcl --dynamic-space-size 1024 --noinform --load "$Rdr/make-wisp-rdr.lisp"
}
# Fetch $LibUrl into $Ourmine/lib/lib.zip and unpack it there, with
# command tracing switched on for the duration.
# Globals: Ourmine (read), LibUrl (read)
# Returns: non-zero if the directory, the download or the unzip fails.
downloads() {
  local status
  set -x
  (
    # Without this guard a failed cd would download into whatever
    # directory the caller happened to be in.
    cd "$Ourmine/lib" || exit 1
    wget -O lib.zip "$LibUrl" && unzip -o lib.zip
  )
  status=$?
  set +x
  return $status
}
#### stuff for the turkey experiment
# Write the sed script $Tmp/etc/seds that renames attribute spellings to
# one vocabulary (for example locode -> loc, {no,yes} -> {false,true}).
# Globals: Tmp (read)
# Returns: non-zero if $Tmp/etc cannot be created or written.
setUpSeds() {
  # Nothing else in this file creates $Tmp/etc, so make it here.
  mkdir -p "$Tmp/etc" || return
  cat <<-'EOF' > "$Tmp/etc/seds"
s/loccodeandcomment/loc_code_and_comment/
s/locodeandcomment/loc_code_and_comment/
s/locandcomment/loc_code_and_comment/
s/essential_complexity/ev(g)/
s/cyclomatic_complexity/v(g)/
s/halstead_length/n/
s/halstead_level/l/
s/num_operators/n1/
s/num_operands/n2/
s/unique_operands/uniq_opnd/
s/unique_operators/uniq_op/
s/halstead_content/i/
s/halstead_error_est/b/
s/halstead_prog_time/t/
s/halstead_effort/e/
s/halstead_difficulty/d/
s/halstead_volume/v/
s/loc_comments/loc_comment/
s/design_complexity/iv(g)/
s/locomment/loc_comment/
s/loc_total/loc/
s/locode/loc/
s/[\t ]c[\t ]/ defects /
s/[\t ]problems[\t ]/ defects /
s/branchcout/branch_count/
s/total_op[\t ]/n1 /
s/total_opnd/n2/
s/{no,yes}/{false,true}/
EOF
}
# Lower-case every $Dirs/mdp/*.arff file, run it through the sed script
# $Tmp/etc/seds, and store the result under the same base name in
# $Tmp/arff.
# Globals: Dirs (read), Tmp (read)
# Returns: non-zero if $Tmp/arff cannot be created.
prep() {
  local src
  # Nothing else in this file creates $Tmp/arff, so make it here.
  mkdir -p "$Tmp/arff" || return
  for src in "$Dirs"/mdp/*.arff; do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -f "$src" ] || continue
    tr A-Z a-z < "$src" |
      sed -f "$Tmp/etc/seds" > "$Tmp/arff/$(basename "$src")"
  done
}
# Count the class symbols (last comma-separated field) of the data rows
# of an arff file read from stdin.
# Options:  -b, --brief   print only the class names, not "count,name"
# Outputs:  one line per class on stdout
# Returns:  1 (after a message via blabln) on an unknown option.
classes() {
  local brief=0
  # Only arguments that begin with "-" are options; the earlier
  # echo|grep test also fired on a dash anywhere in the word and was
  # confused by words such as "-n" that echo swallows.
  while [ $# -gt 0 ]; do
    case $1 in
      -b|--brief) brief=1 ;;
      -*) blabln "'$1' unknown\n usage cat file | classes [options]"
          return 1 ;;
      *) break ;;
    esac
    shift
  done
  gawk '
    BEGIN { OFS = FS = ","
            IGNORECASE = 1
            Brief = 0 }
    { gsub(/#.*/, "") }                  # strip comments
    /^[ \t]*$/ { next }
    Data && NF > 1 { Freq[$NF]++ }       # rows after the @data marker
    /@data/ { Data = 1 }
    END {
      for (N in Freq)
        if (Brief) { print N } else { print Freq[N], N }
    }
  ' Brief="$brief" -
}
# List the attribute names (second word of each @attribute line) that
# occur in every one of the arff files given as arguments.
# Arguments: one or more arff files
# Outputs:   one attribute name per line, in no particular order
intersectAttributes() {
  # "$@" rather than $1: the caller hands over a whole glob of files and
  # an intersection over just the first file is meaningless.
  gawk '
    BEGIN { IGNORECASE = 1; OFS = "," }
    FNR == 1 { Files++ }
    /@attribute/ { Got[$2]++ }
    END {
      for (A in Got)
        if (Got[A] >= Files)
          print A
    }' "$@"
}
# Print, one per line and sorted, the attributes common to all
# $Tmp/arff/*.arff files, leaving out any name containing "defects",
# and then print "defects" as the final line.
# Globals: Tmp (read)
shared() {
  local attr
  # The helper is intersectAttributes; the call used to be misspelt
  # ("intesectAttributes"), so only the trailing "defects" was printed.
  for attr in $(intersectAttributes "$Tmp"/arff/*.arff |
                sort |
                grep -v defects); do
    printf '%s\n' "$attr"
  done
  echo defects
}
# Generate an arff file that only contains certain attributes: runs
# some.awk over file $2 with the awk variable Some set to $1.
some() {
  local wanted=$1
  local arff=$2
  gawk -f some.awk -v Some="$wanted" $arff
}
# For every $Tmp/arff/*.arff file, write to $Tmp/shared a copy (made by
# `some`) restricted to the attribute list printed by `shared`.
# Globals: Tmp (read), Shared (written: the output of `shared`)
# Outputs: the path of each file processed, one per line
# Returns: non-zero if $Tmp/shared cannot be created.
makeshare() {
  local src
  Shared=$(shared)
  # Nothing else in this file creates $Tmp/shared, so make it here.
  mkdir -p "$Tmp/shared" || return
  for src in "$Tmp"/arff/*.arff; do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -f "$src" ] || continue
    echo "$src"
    some "$Shared" "$src" > "$Tmp/shared/$(basename "$src")"
  done
}
# Print selected paragraphs of stdin, indented.
# Input is read in paragraph mode (blank-line separated records).
# Arguments: $1 - number of spaces to indent each line by
#            $2 - comma-separated paragraph numbers, printed in that order
# Outputs:   a line holding one space, then each chosen paragraph
#            preceded by an empty line, then a final empty line
report() {
  gawk '
    BEGIN { RS = ""; FS = "\n" }
    NR == 1 { M = split(Show, Shows, ",") }
    { R[++N] = indent($0) }
    END {
      print " "
      for (r = 1; r <= M; r++) printf("\n%s", R[Shows[r]])
      print ""
    }
    # n copies of chr (default: a space)
    function pad(n, chr,   out) {
      chr = chr ? chr : " "
      while (n-- > 0) out = out chr
      return out
    }
    # The lines (fields) of the current record, each indented. The
    # parameter used to share its name with the padding function, which
    # current gawk rejects; the two now have distinct names.
    function indent(rec,   i, out) {
      for (i = 1; i <= NF; i++)
        out = out pad(Indent, " ") $i "\n"
      return out
    }
  ' Show="$2" Indent="$1" -
}
#### end inter intra stuff
#### misc utils
# Split an arff file into train/test files with someArff.
# Arguments: $1 - random seed
#            $2 - number of bins
#            $3 - bin to hold out
#            $4 - (optional) arff file to read; stdin when omitted
# The workers in this file pass the file as $4; it used to be ignored,
# which left someArff reading whatever stdin happened to be.
makeTrainTest() {
  if [ -n "$4" ]; then
    someArff --seed "$1" --bins "$2" --bin "$3" < "$4"
  else
    someArff --seed "$1" --bins "$2" --bin "$3"
  fi
}
# Reduce whitespace-separated rows on stdin to comma-separated pairs:
# a 3-field row yields fields 2 and 3, a 4-field row yields fields 2 and
# 4; rows of any other width are dropped.
# The UnLog switch (raise Ee, a rational approximation held in the
# program, to each value) is never enabled by this function.
gotwant() {
  gawk '
    BEGIN { UnLog = 0; OFS = ","; Ee = 848456353 / 312129649 }
    NF == 4 { print $2, $4 }
    NF == 3 { if (UnLog) print Ee^$2, Ee^$3; else print $2, $3 }
  ' -
}
# Run $Here/lib/sample.awk with the awk variables File=$1, Wanted=$2 and
# Sample=$3.
sample() {
  local file=$1
  local wanted=$2
  local mode=$3
  gawk -f $Here/lib/sample.awk -v File="$file" -v Wanted="$wanted" -v Sample="$mode"
}
# Score "predicted,actual" pairs read from stdin against a goal class.
# Options:
#   -g, --goal RE       regular expression naming the positive class
#                       (default "true|yes")
#   -p, --prefix TEXT   printed, followed by a comma, before the scores
#   -b, --before TEXT   printed first, with backslash escapes expanded
#   -d, --decimals N    decimal places for the percentages (default 2)
# A leading argument that does not begin with "-" ends option parsing
# and is ignored.
# Outputs: a,b,c,d,accuracy,pd,pf,precision,balance on stdout where
#          a = neither predicted nor actual matches the goal,
#          b = actual only, c = predicted only, d = both.
# Returns: 1 on an unknown option or an option without its value.
abcd() {
  local goal="true|yes"
  local before=""
  local prefix=""
  local decimals=2
  while [ $# -gt 0 ]; do
    case $1 in
      -d|--decimals|-b|--before|-p|--prefix|-g|--goal)
        # "shift 2" fails with one argument left and the old loop then
        # spun forever, so insist on the value being present.
        if [ $# -lt 2 ]; then
          blabln "'$1' needs a value\n usage abcd [options]"
          return 1
        fi
        case $1 in
          -d|--decimals) decimals=$2 ;;
          -b|--before)   before=$2 ;;
          -p|--prefix)   prefix=$2 ;;
          -g|--goal)     goal=$2 ;;
        esac
        shift 2 ;;
      -*) blabln "'$1' unknown\n usage abcd [options]"
          return 1 ;;
      *) break ;;
    esac
  done
  # %b keeps the \n escapes callers rely on without treating the text as
  # a format string.
  [ -n "$before" ] && printf '%b' "$before"
  gawk '
    BEGIN {
      Decimals = 3
      Got = 1
      Want = 2
      Prefix = ""
      True = "true"      ## define symbol 1
      A = B = C = D = 0
      FS = OFS = ","
      GoalPd = 1
      GoalPf = 0
    }
    function yes(s) { return s ~ True }
    function no(s)  { return (yes(s) ? 0 : 1) }
    { sub(/#.*/, "") }
    /^[ \t]*$/ { next }
    NF == 2 {
      N++
      Predicted = $Got
      Actual = $Want
      if (Predicted == Actual) Good++
      if (no( Actual) && no( Predicted)) A++
      if (yes(Actual) && no( Predicted)) B++
      if (no( Actual) && yes(Predicted)) C++
      if (yes(Actual) && yes(Predicted)) D++
    }
    END {
      OFMT = "%." Decimals "f"
      Balance = Precision = Accuracy = Pf = NotPf = Pd = 0
      if (C+D > 0)       Precision = D/(C+D)
      if ((A+B+C+D) > 0) Accuracy  = (A+D)/(A+B+C+D)
      if (A+C > 0)       Pf = C/(A+C)
      if (B+D > 0)       Pd = D/(B+D)
      if (B+C+D > 0) {   # special case- everything misses
        Balance = 1 - sqrt((GoalPd - Pd)^2 + (GoalPf - Pf)^2)/sqrt(2)
      }
      # Print the prefix as data: it may legitimately contain a %.
      if (Prefix) printf "%s", Prefix OFS
      print A, B, C, D,
            sprintf(OFMT, 100*Accuracy),
            sprintf(OFMT, 100*Pd),
            sprintf(OFMT, 100*Pf),
            sprintf(OFMT, 100*Precision),
            sprintf(OFMT, 100*Balance)
    }' Prefix="$prefix" Decimals="$decimals" True="$goal" -
}
malign() {
cat - | gawk '
BEGIN { Width=1;
Gutter=1;
OFS=FS=",";
}
{ N++;
for(I=1;I<=NF;I++) {
if( (L=length($I)) > Max[I]) Max[I]=L;
++Data[N,0];
Data[N,I]=$I; }
}
END {for(J=1;J<=N;J++) {
Str=Sep1="";
if (Data[J,0]>1) {
for(I=1;I<=NF;I++) {
L=length(Data[J,I]);
Str = Str Sep1 \
str(most(Width,Max[I]+Gutter+1)-L," ") \
Data[J,I];
Sep1= OFS;
}}
else {Str=Data[J,1]}
print Str;}
}
function str(n,c, out) { while(--n > 0) out = out c; return out; }
function most(x,y) { return x > y ? x : y; }
'
}
# Turn comma-separated quartile/statistics rows on stdin into LaTeX
# table rows on stdout.
#  - a line containing "===" prints its first field and resets the rank
#  - an empty line is echoed as an empty line
#  - a line containing "#" is dropped
#  - whenever the field count changes a header is printed first: the
#    tabular header for 7-field rows, the word "stats" otherwise
#  - 5-field rows go through statsprint(): a rank (bumped whenever
#    field 4 differs from the previous row) followed by the fields
#    joined with "&"
#  - 7-field rows go through qprint(): field 1, field 4 and a \boxplot
#    macro built from fields 2-6
# trim() is defined but not called by the active program. The
# commented-out gawk program at the end is kept as found.
quartile2tex() {
cat - | gawk 'BEGIN { FS=","
OFS="&"}
/===/ {print $1; last=0 ; rank=1; next}
NF==0 { print ""; next}
/#/ {next}
Last!=NF {
print (NF==7 ? "\\scriptsize\\begin{tabular}{rrrr} rank & treatment & median & distribution\\\\" : "stats") ; Last=NF}
NF==5 { print statsprint() }
NF==7 { print qprint() }
function statsprint( sep,i,out) {
if ($4 != last) {
rank++
}
last=$4
out = rank " & "
for(i=1;i<=NF;i++) {
out = out sep $i
sep="&"
}
return out "\\\\"
}
function qprint( sep,i,out) {
out = "1 & " $1 "&" $4 "&"
out=out "\\boxplot{"$2"}{"$3"}{"$4"}{"$5 - $3"}{"$6"}"
return out "\\\\"
}
function trim(s) {
gsub(/[ \t]/,"",s);
return s
} '
# gawk 'BEGIN {FS="\n"; RS=""}
# { M=split($0,Lines,/\\t/)
# Z=0;
# for(I=1;I<=M;I++) {
# N=split(Lines[I],Words,/\\t/)
# print ++Z " " trim(Lines[I])
# #for(J=1;J<=N; J++)
# # print Z " :: " J " :: " trim(Words[J])
# }
# }
# function trim(s) {
# sub(/^[ \t\n]*/,"",s);
# sub(/[ \t]\n*$/,"",s);
# return s
# }'
}
# medians [-s|--start N]: echo every stdin line and, for non-blank lines
# without "#", store comma-separated fields N..NF (N defaults to 2) per
# column; the END block starts a summary line with "##".
# NOTE(review): the awk text is damaged from the line
# "for(I=2;I 2 {" onwards. It looks as if everything between a "<" and a
# later ">" was lost, fusing the end of this program with the body of a
# different one (a log transform of numeric columns; a logNumbers
# function is called elsewhere in this file but not defined in the
# visible part) -- restore both from a clean copy before relying on this.
# NOTE(review): as written no Start=... assignment is passed to gawk, so
# the parsed $start is never used -- confirm against a clean copy.
# NOTE(review): "shift 2" cannot advance when an option is the last
# argument, so "medians --start" would loop forever -- confirm and guard.
medians() {
local start="2"
while [ `echo $1 | grep "-"` ]; do
case $1 in
-s|--start) start=$2;;
*) blabln "'"$1"' unknown\n usage medians [options]";
return 1;;
esac
shift 2
done
gawk '
BEGIN{FS=","}
{print}
/^[ \t]*$/ {next}
/#/ {next}
{for(I=Start;I<=NF;I++) {
(Data[I,0]++); Data[I,Data[I,0]]=$I }
}
END{ #printf("#---")
#for(I=Start;I<=NF;I++)
# printf(",-----")
#print ""
printf("##");
printf $1
for(I=2;I 2 {
for(I=1;I<=Attr;I++)
if (I in Num)
if ($I !~ /\?/) {
if ( ($I +0) < Min) {Bad=1} else {Bad=0}
if (Bad) $I= Min;
$I=log($I)
}
print $0
}
' -
}
# Run mwu.awk over a results file and print its rows under a
# "#key,ties,win,loss,win-loss" header, sorted by descending column 5
# and aligned with malign.
# Options:
#   -f, --fields N     passed to mwu.awk as Fields      (default 10)
#   -k, --key N        passed as Key                    (default 1)
#   -p, --perform N    passed as Performance            (default 10)
#   -i, --input FILE   file to read                     (default "-")
#   --95 | --99        passed as Confidence             (default 95)
#   --high | --low     passed as High=1 / High=0        (default 1)
# Returns: 1 on an unknown option or an option without its value.
winLossTie() {
  local fields=10
  local key=1
  local performance=$fields
  local high=1
  local confidence=95
  local input="-"
  while [ $# -gt 0 ]; do
    case $1 in
      -f|--fields|-k|--key|-p|--perform|-i|--input)
        # "shift 2" fails with one argument left and the old loop then
        # spun forever, so insist on the value being present.
        if [ $# -lt 2 ]; then
          blabln "'$1' needs a value\n. usage: winLossTie [options]"
          return 1
        fi
        case $1 in
          -f|--fields)  fields=$2 ;;
          -k|--key)     key=$2 ;;
          -p|--perform) performance=$2 ;;
          -i|--input)   input=$2 ;;
        esac
        shift 2 ;;
      --99)   confidence=99; shift ;;
      --95)   confidence=95; shift ;;
      --high) high=1; shift ;;
      --low)  high=0; shift ;;
      -*) blabln "'$1' unknown\n. usage: winLossTie [options]"
          return 1 ;;
      *) break ;;
    esac
  done
  ( echo "#key,ties,win,loss,win-loss"
    gawk -f mwu.awk Fields="$fields" Key="$key" Performance="$performance" \
      High="$high" Confidence="$confidence" "$input" |
      sort -t, -r -n -k 5,5
  ) | malign
}
someArff() {
#Q7: add command-line options to someArff to control the
# names of the generated test/train files (currently
# train.arff and test.arff). Remember to define default
# values for these variables and to update the help
# text. Hand in your new definition of "someArff"
local bins=3
local bin=1
local seed=$RANDOM
while [ `echo $1 | grep "-"` ]; do
case $1 in
-B|--bins) bins=$2;;
-b|--bin) bin=$2;;
-s|--seed) seed=$2;;
-h|--help) cat <<-EOF
someArff : divide an arrf file into Bins, create train/test files
usage: someArff [flags] arffFile
Flags
-B, --bins NUM Randomly divide the data into NUM bins
-b, --bin NUM Store bin NUM into test.arff and rest into train.arff
-s, --seed NUM Set the random number seed to NUM
-h, --help Print this text
EOF
return 1;;
*) blabln "'"$1"' unknown\n usage cat file | someArff [options]"
return 1;;
esac
shift 2
done
gawk '
BEGIN {
IGNORECASE=1;
Trainf="train.arff"; Testf="test.arff";
Bins=3;
Bin=1;
Seed=1;
}
/^[ \t]*$/ { next }
/@relation/ { Seed ? srand(Seed) : srand(1) }
/@relation/ { printf "">Trainf; printf "">Testf }
/@relation/,/@data/ { print $0 >> Trainf; print $0 >> Testf; next }
{ Line[rand()] = $0; Lines++ }
END {
###print Seed
Start = Lines/Bins * (Bin - 1) ;
Stop = Lines/Bins * Bin;
for(I in Line) {
N++;
What = (N>= Start && N < Stop) ? Testf : Trainf
print Line[I]>>What; }
}
' Seed=$seed Bins=$bins Bin=$bin -
}
#### Weka stuff
## pruning columns
# Run Weka's unsupervised Remove filter with the range "$1-$2" on arff
# file $3, writing tmp.arff in the current directory and then printing
# it. Marks progress with "/" via blab. Also switches shell tracing off.
removeAttributes() {
  local range="${1}-${2}"
  blab "/"
  $Weka weka.filters.unsupervised.attribute.Remove -R "$range" -i $3 -o tmp.arff
  set +x
  cat tmp.arff
}
## discretization
# Run Weka's supervised Discretize filter (-c last -R first-last) on
# arff file $1, writing tmp.arff in the current directory and then
# printing it. Marks progress with "x" via blab.
discretizeViaFayyadIrani() {
  local -a opts=(-c last -R first-last -i $1 -o tmp.arff)
  blab "x"
  $Weka weka.filters.supervised.attribute.Discretize "${opts[@]}"
  cat tmp.arff
}
## feature subset selection
# Run Weka's supervised AttributeSelection filter with the Ranker search
# and the InfoGainAttributeEval evaluator on arff file $1, writing
# tmp.arff in the current directory and then printing it. Marks
# progress with "<" via blab.
rankViaInfoGain() {
  local search="weka.attributeSelection.Ranker -T -2.7976931348623157E308 -N -1"
  local evaluator="weka.attributeSelection.InfoGainAttributeEval"
  blab "<"
  $Weka weka.filters.supervised.attribute.AttributeSelection \
    -S "$search" -E "$evaluator" -i $1 -o tmp.arff
  cat tmp.arff
}
### learners
## classifiers
# rule-based classifiers
# Run Weka's OneR (-B 6 -p 0) with $1 as the -t file and $2 as the -T
# file. Marks progress with "1" via blab.
oner() {
  local -a opts=(-B 6 -p 0 -t $1 -T $2)
  blab "1"
  $Weka weka.classifiers.rules.OneR "${opts[@]}"
}
# Run Weka's JRip (-F 3 -N 2.0 -O 2 -S 1 -p 0) with $1 as the -t file
# and $2 as the -T file. Marks progress with "j" via blab.
jrip() {
  local -a opts=(-F 3 -N 2.0 -O 2 -S 1 -p 0 -t $1 -T $2)
  blab "j"
  $Weka weka.classifiers.rules.JRip "${opts[@]}"
}
# Run Weka's Ridor (-F 3 -N 2.0 -S 1) with $1 as the -t file and $2 as
# the -T file, followed by -p 0. Marks progress with "R" via blab.
ridor() {
  local -a opts=(-F 3 -N 2.0 -S 1 -t $1 -T $2 -p 0)
  blab "R"
  $Weka weka.classifiers.rules.Ridor "${opts[@]}"
}
# Run Weka's JRip (-F 3 -N 2.0 -O 2 -S 1) with only a -t file, $1.
# Marks progress with "j" via blab.
jrip10() {
  local -a opts=(-F 3 -N 2.0 -O 2 -S 1 -t $1)
  blab "j"
  $Weka weka.classifiers.rules.JRip "${opts[@]}"
}
# Run Weka's PART (-M 2 -C 0.25 -Q 1) with $1 as the -t file and $2 as
# the -T file. Marks progress with "p" via blab.
part() {
  local -a opts=(-M 2 -C 0.25 -Q 1 -t $1 -T $2)
  blab "p"
  $Weka weka.classifiers.rules.PART "${opts[@]}"
}
# bayesian classifiers
# Run Weka's AODE (-F 0 -p 0) with $1 as the -t file and $2 as the -T
# file. Marks progress with "a" via blab.
aode() {
  local -a opts=(-F 0 -p 0 -t $1 -T $2)
  blab "a"
  $Weka weka.classifiers.bayes.AODE "${opts[@]}"
}
# Run Weka's AODE (-F 0) with $1 as the -t file and $2 as the -T file,
# without -p. Marks progress with "a" via blab.
aode10() {
  local -a opts=(-F 0 -t $1 -T $2)
  blab "a"
  $Weka weka.classifiers.bayes.AODE "${opts[@]}"
}
# Run Weka's NaiveBayes (-p 0) with $1 as the -t file and $2 as the -T
# file. Marks progress with "n" via blab.
nb() {
  local -a opts=(-p 0 -t $1 -T $2)
  blab "n"
  $Weka weka.classifiers.bayes.NaiveBayes "${opts[@]}"
}
# Run Weka's NaiveBayes with -i and only a -t file, $1. Marks progress
# with "n" via blab.
nb10() {
  local -a opts=(-i -t $1)
  blab "n"
  $Weka weka.classifiers.bayes.NaiveBayes "${opts[@]}"
}
# Run Weka's NaiveBayes with -K (-p 0), $1 as the -t file and $2 as the
# -T file. Marks progress with "k" via blab.
nbk() {
  local -a opts=(-K -p 0 -t $1 -T $2)
  blab "k"
  $Weka weka.classifiers.bayes.NaiveBayes "${opts[@]}"
}
# Run Weka's LWL with -K 10 (-p 0), $1 as the -t file and $2 as the -T
# file. Marks progress with "L" via blab.
lwl010() {
  local -a opts=(-K 10 -p 0 -t $1 -T $2)
  blab "L"
  $Weka weka.classifiers.lazy.LWL "${opts[@]}"
}
# Run Weka's LWL with -K 25 (-p 0), $1 as the -t file and $2 as the -T
# file. Marks progress with "L" via blab.
lwl25() {
  local -a opts=(-K 25 -p 0 -t $1 -T $2)
  blab "L"
  $Weka weka.classifiers.lazy.LWL "${opts[@]}"
}
# Run Weka's LWL with -K 50 (-p 0), $1 as the -t file and $2 as the -T
# file. Marks progress with "L" via blab.
lwl50() {
  local -a opts=(-K 50 -p 0 -t $1 -T $2)
  blab "L"
  $Weka weka.classifiers.lazy.LWL "${opts[@]}"
}
# Run Weka's LWL with -K 100 (-p 0), $1 as the -t file and $2 as the -T
# file. Marks progress with "L" via blab.
lwl100() {
  local -a opts=(-K 100 -p 0 -t $1 -T $2)
  blab "L"
  $Weka weka.classifiers.lazy.LWL "${opts[@]}"
}
# Run PercentileBayes.jar (looked up in the current directory) with the
# first four arguments. Marks progress with "pb" via blab.
pb() {
  local -a args=($1 $2 $3 $4)
  blab "pb"
  java -jar PercentileBayes.jar "${args[@]}"
}
# decision tree learners
# Run Weka's J48 (-C 0.25 -M 2 -p 0) with $1 as the -t file and $2 as
# the -T file. Marks progress with "4" via blab.
j48() {
  local -a opts=(-C 0.25 -M 2 -p 0 -t $1 -T $2)
  blab "4"
  #blab $1
  $Weka weka.classifiers.trees.J48 "${opts[@]}"
}
# Run Weka's J48 (-C 0.25 -M 2) with -i and only a -t file, $1. Marks
# progress with "c" via blab.
j4810() {
  local -a opts=(-C 0.25 -M 2 -i -t $1)
  blab "c"
  $Weka weka.classifiers.trees.J48 "${opts[@]}"
}
# Like j4810, but the -C value is taken from $2. Marks progress with
# "c" followed by $2 via blab.
j4810c() {
  local -a opts=(-C $2 -M 2 -i -t $1)
  blab "c$2"
  $Weka weka.classifiers.trees.J48 "${opts[@]}"
}
## linear-model learners
# Run Weka's LinearRegression (-S 0 -R 1.0E-8 -p 0) with $1 as the -t
# file and $2 as the -T file. Marks progress with "L" via blab.
lsr() {
  local -a opts=(-S 0 -R 1.0E-8 -p 0 -t $1 -T $2)
  blab "L"
  $Weka weka.classifiers.functions.LinearRegression "${opts[@]}"
}
# Run Weka's M5P (-p 0) with $1 as the -t file and $2 as the -T file.
# Marks progress with "P" via blab.
m5p() {
  local -a opts=(-p 0 -t $1 -T $2)
  blab "P"
  $Weka weka.classifiers.trees.M5P "${opts[@]}"
}
## nearest neighbor
# Run Weka's IBk (-K 1 -W 0 -X -E -p 0) with $1 as the -t file and $2
# as the -T file. Marks progress with "N" via blab.
1Bkx() {
  local -a opts=(-K 1 -W 0 -X -E -p 0 -t $1 -T $2)
  blab "N"
  $Weka weka.classifiers.lazy.IBk "${opts[@]}"
}
# Run Weka's IBk (-K -1 -W 0 -E -p 0) with $1 as the -t file and $2 as
# the -T file. Marks progress with "n" via blab.
1Bk() {
  local -a opts=(-K -1 -W 0 -E -p 0 -t $1 -T $2)
  blab "n"
  $Weka weka.classifiers.lazy.IBk "${opts[@]}"
}
## association rule learners
# Run Weka's Apriori (-N 10 -T 0 -C 0.9 -D 0.05 -U 1.0 -M 0.1 -S -1.0
# -p 0) with $1 as the -t file and $2 as the trailing -T value. Marks
# progress with "A" via blab.
apriori() {
  local -a opts=(-N 10 -T 0 -C 0.9 -D 0.05 -U 1.0 -M 0.1 -S -1.0 -p 0 -t $1 -T $2)
  blab "A"
  $Weka weka.associations.Apriori "${opts[@]}"
}
#### teaching demos
# Print the 14-row nominal "weather" teaching data set as an arff file
# on stdout.
weather.nominal() {
  printf '%s\n' \
    '@relation weather.nominal' \
    '@attribute outlook {sunny, overcast, rainy}' \
    '@attribute temperature {hot, mild, cool}' \
    '@attribute humidity {high, normal}' \
    '@attribute windy {TRUE, FALSE}' \
    '@attribute play {yes, no}' \
    '@data' \
    'sunny,hot,high,FALSE,no' \
    'sunny,hot,high,TRUE,no' \
    'overcast,hot,high,FALSE,yes' \
    'rainy,mild,high,FALSE,yes' \
    'rainy,cool,normal,FALSE,yes' \
    'rainy,cool,normal,TRUE,no' \
    'overcast,cool,normal,TRUE,yes' \
    'sunny,mild,high,FALSE,no' \
    'sunny,cool,normal,FALSE,yes' \
    'rainy,mild,normal,FALSE,yes' \
    'sunny,mild,normal,TRUE,yes' \
    'overcast,mild,high,TRUE,yes' \
    'overcast,hot,normal,FALSE,yes' \
    'rainy,mild,high,TRUE,no'
}
# Print the "auto93" teaching data set as an arff file on stdout: 23
# attributes, the last of which ("class") is declared real, followed by
# the comma-separated data rows. "?" marks a missing value.
auto93() { cat<<-EOF
@relation 'auto93.names'
@attribute Manufacturer { Acura, Audi, BMW, Buick, Cadillac, Chevrolet, Chrysler, Dodge, Eagle, Ford, Geo, Honda, Hyundai, Infiniti, Lexus, Lincoln, Mazda, Mercedes-Benz, Mercury, Mitsubishi, Nissan, Oldsmobile, Plymouth, Pontiac, Saab, Saturn, Subaru, Suzuki, Toyota, Volkswagen, Volvo}
@attribute Type { Small, Midsize, Compact, Large, Sporty, Van}
@attribute City_MPG real
@attribute Highway_MPG real
@attribute Air_Bags_standard { 0, 2, 1}
@attribute Drive_train_type { 1, 0, 2}
@attribute Number_of_cylinders real
@attribute Engine_size real
@attribute Horsepower real
@attribute RPM real
@attribute Engine_revolutions_per_mile real
@attribute Manual_transmission_available { 1, 0}
@attribute Fuel_tank_capacity real
@attribute Passenger_capacity real
@attribute Length real
@attribute Wheelbase real
@attribute Width real
@attribute U-turn_space real
@attribute Rear_seat_room real
@attribute Luggage_capacity real
@attribute Weight real
@attribute Domestic { 0, 1}
@attribute class real
@data
Acura,Small,25,31,0,1,4,1.8,140,6300,2890,1,13.2,5,177,102,68,37,26.5,11,2705,0,15.9
Acura,Midsize,18,25,2,1,6,3.2,200,5500,2335,1,18,5,195,115,71,38,30,15,3560,0,33.9
Audi,Compact,20,26,1,1,6,2.8,172,5500,2280,1,16.9,5,180,102,67,37,28,14,3375,0,29.1
Audi,Midsize,19,26,2,1,6,2.8,172,5500,2535,1,21.1,6,193,106,70,37,31,17,3405,0,37.7
BMW,Midsize,22,30,1,0,4,3.5,208,5700,2545,1,21.1,4,186,109,69,39,27,13,3640,0,30
Buick,Midsize,22,31,1,1,4,2.2,110,5200,2565,0,16.4,6,189,105,69,41,28,16,2880,1,15.7
Buick,Large,19,28,1,1,6,3.8,170,4800,1570,0,18,6,200,111,74,42,30.5,17,3470,1,20.8
Buick,Large,16,25,1,0,6,5.7,180,4000,1320,0,23,6,216,116,78,45,30.5,21,4105,1,23.7
Buick,Midsize,19,27,1,1,6,3.8,170,4800,1690,0,18.8,5,198,108,73,41,26.5,14,3495,1,26.3
Cadillac,Large,16,25,1,1,8,4.9,200,4100,1510,0,18,6,206,114,73,43,35,18,3620,1,34.7
Cadillac,Midsize,16,25,2,1,8,4.6,295,6000,1985,0,20,5,204,111,74,44,31,14,3935,1,40.1
Chevrolet,Compact,25,36,0,1,4,2.2,110,5200,2380,1,15.2,5,182,101,66,38,25,13,2490,1,13.4
Chevrolet,Compact,25,34,1,1,4,2.2,110,5200,2665,1,15.6,5,184,103,68,39,26,14,2785,1,11.4
Chevrolet,Sporty,19,28,2,0,6,3.4,160,4600,1805,1,15.5,4,193,101,74,43,25,13,3240,1,15.1
Chevrolet,Midsize,21,29,0,1,4,2.2,110,5200,2595,0,16.5,6,198,108,71,40,28.5,16,3195,1,15.9
Chevrolet,Van,18,23,0,1,6,3.8,170,4800,1690,0,20,7,178,110,74,44,30.5,?,3715,1,16.3
Chevrolet,Van,15,20,0,2,6,4.3,165,4000,1790,0,27,8,194,111,78,42,33.5,?,4025,1,16.6
Chevrolet,Large,17,26,1,0,8,5,170,4200,1350,0,23,6,214,116,77,42,29.5,20,3910,1,18.8
Chevrolet,Sporty,17,25,1,0,8,5.7,300,5000,1450,1,20,2,179,96,74,43,?,?,3380,1,38
Chrysler,Large,20,28,2,1,6,3.3,153,5300,1990,0,18,6,203,113,74,40,31,15,3515,1,18.4
Chrysler,Compact,23,28,2,1,4,3,141,5000,2090,0,16,6,183,104,68,41,30.5,14,3085,1,15.8
Chrysler,Large,20,26,1,1,6,3.3,147,4800,1785,0,16,6,203,110,69,44,36,17,3570,1,29.5
Dodge,Small,29,33,0,1,4,1.5,92,6000,3285,1,13.2,5,174,98,66,32,26.5,11,2270,1,9.2
Dodge,Small,23,29,1,1,4,2.2,93,4800,2595,1,14,5,172,97,67,38,26.5,13,2670,1,11.3
Dodge,Compact,22,27,1,1,4,2.5,100,4800,2535,1,16,6,181,104,68,39,30.5,14,2970,1,13.3
Dodge,Van,17,21,1,2,6,3,142,5000,1970,0,20,7,175,112,72,42,26.5,?,3705,1,19
Dodge,Midsize,21,27,1,1,4,2.5,100,4800,2465,0,16,6,192,105,69,42,30.5,16,3080,1,15.6
Dodge,Sporty,18,24,1,2,6,3,300,6000,2120,1,19.8,4,180,97,72,40,20,11,3805,1,25.8
Eagle,Small,29,33,0,1,4,1.5,92,6000,2505,1,13.2,5,174,98,66,36,26.5,11,2295,1,12.2
Eagle,Large,20,28,2,1,6,3.5,214,5800,1980,0,18,6,202,113,74,40,30,15,3490,1,19.3
Ford,Small,31,33,0,1,4,1.3,63,5000,3150,1,10,4,141,90,63,33,26,12,1845,1,7.4
Ford,Small,23,30,0,1,4,1.8,127,6500,2410,1,13.2,5,171,98,67,36,28,12,2530,1,10.1
Ford,Compact,22,27,0,1,4,2.3,96,4200,2805,1,15.9,5,177,100,68,39,27.5,13,2690,1,11.3
Ford,Sporty,22,29,1,0,4,2.3,105,4600,2285,1,15.4,4,180,101,68,40,24,12,2850,1,15.9
Ford,Sporty,24,30,1,1,4,2,115,5500,2340,1,15.5,4,179,103,70,38,23,18,2710,1,14
Ford,Van,15,20,1,2,6,3,145,4800,2080,1,21,7,176,119,72,45,30,?,3735,1,19.9
Ford,Midsize,21,30,1,1,6,3,140,4800,1885,0,16,5,192,106,71,40,27.5,18,3325,1,20.2
Ford,Large,18,26,1,0,8,4.6,190,4200,1415,0,20,6,212,114,78,43,30,21,3950,1,20.9
Geo,Small,46,50,0,1,3,1,55,5700,3755,1,10.6,4,151,93,63,34,27.5,10,1695,0,8.4
Geo,Sporty,30,36,1,1,4,1.6,90,5400,3250,1,12.4,4,164,97,67,37,24.5,11,2475,0,12.5
Honda,Sporty,24,31,2,1,4,2.3,160,5800,2855,1,15.9,4,175,100,70,39,23.5,8,2865,0,19.8
Honda,Small,42,46,1,1,4,1.5,102,5900,2650,1,11.9,4,173,103,67,36,28,12,2350,0,12.1
Honda,Compact,24,31,2,1,4,2.2,140,5600,2610,1,17,4,185,107,67,41,28,14,3040,0,17.5
Hyundai,Small,29,33,0,1,4,1.5,81,5500,2710,1,11.9,5,168,94,63,35,26,11,2345,0,8
Hyundai,Small,22,29,0,1,4,1.8,124,6000,2745,1,13.7,5,172,98,66,36,28,12,2620,0,10
Hyundai,Sporty,26,34,0,1,4,1.5,92,5550,2540,1,11.9,4,166,94,64,34,23.5,9,2285,0,10
Hyundai,Midsize,20,27,0,1,4,2,128,6000,2335,1,17.2,5,184,104,69,41,31,14,2885,0,13.9
Infiniti,Midsize,17,22,1,0,8,4.5,278,6000,1955,0,22.5,5,200,113,72,42,29,15,4000,0,47.9
Lexus,Midsize,18,24,1,1,6,3,185,5200,2325,1,18.5,5,188,103,70,40,27.5,14,3510,0,28
Lexus,Midsize,18,23,2,0,6,3,225,6000,2510,1,20.6,4,191,106,71,39,25,9,3515,0,35.2
Lincoln,Midsize,17,26,2,1,6,3.8,160,4400,1835,0,18.4,6,205,109,73,42,30,19,3695,1,34.3
Lincoln,Large,18,26,2,0,8,4.6,210,4600,1840,0,20,6,219,117,77,45,31.5,22,4055,1,36.1
Mazda,Small,29,37,0,1,4,1.6,82,5000,2370,1,13.2,4,164,97,66,34,27,16,2325,0,8.3
Mazda,Small,28,36,0,1,4,1.8,103,5500,2220,1,14.5,5,172,98,66,36,26.5,13,2440,0,11.6
Mazda,Compact,26,34,1,1,4,2.5,164,5600,2505,1,15.5,5,184,103,69,40,29.5,14,2970,0,16.5
Mazda,Van,18,24,0,2,6,3,155,5000,2240,0,19.6,7,190,110,72,39,27.5,?,3735,0,19.1
Mazda,Sporty,17,25,1,0,?,1.3,255,6500,2325,1,20,2,169,96,69,37,?,?,2895,0,32.5
Mercedes-Benz,Compact,20,29,1,0,4,2.3,130,5100,2425,1,14.5,5,175,105,67,34,26,12,2920,0,31.9
Mercedes-Benz,Midsize,19,25,2,0,6,3.2,217,5500,2220,0,18.5,5,187,110,69,37,27,15,3525,0,61.9
Mercury,Sporty,23,26,1,1,4,1.6,100,5750,2475,1,11.1,4,166,95,65,36,19,6,2450,1,14.1
Mercury,Midsize,19,26,0,0,6,3.8,140,3800,1730,0,18,5,199,113,73,38,28,15,3610,1,14.9
Mitsubishi,Small,29,33,0,1,4,1.5,92,6000,2505,1,13.2,5,172,98,67,36,26,11,2295,0,10.3
Mitsubishi,Midsize,18,24,1,1,6,3,202,6000,2210,0,19,5,190,107,70,43,27.5,14,3730,0,26.1
Nissan,Small,29,33,1,1,4,1.6,110,6000,2435,1,13.2,5,170,96,66,33,26,12,2545,0,11.8
Nissan,Compact,24,30,1,1,4,2.4,150,5600,2130,1,15.9,5,181,103,67,40,28.5,14,3050,0,15.7
Nissan,Van,17,23,0,1,6,3,151,4800,2065,0,20,7,190,112,74,41,27,?,4100,0,19.1
Nissan,Midsize,21,26,1,1,6,3,160,5200,2045,0,18.5,5,188,104,69,41,28.5,14,3200,0,21.5
Oldsmobile,Compact,24,31,0,1,4,2.3,155,6000,2380,0,15.2,5,188,103,67,39,28,14,2910,1,13.5
Oldsmobile,Midsize,23,31,1,1,4,2.2,110,5200,2565,0,16.5,5,190,105,70,42,28,16,2890,1,16.3
Oldsmobile,Van,18,23,0,1,6,3.8,170,4800,1690,0,20,7,194,110,74,44,30.5,?,3715,1,19.5
Oldsmobile,Large,19,28,1,1,6,3.8,170,4800,1570,0,18,6,201,111,74,42,31.5,17,3470,1,20.7
Plymouth,Sporty,23,30,0,2,4,1.8,92,5000,2360,1,15.9,4,173,97,67,39,24.5,8,2640,1,14.4
Pontiac,Small,31,41,0,1,4,1.6,74,5600,3130,1,13.2,4,177,99,66,35,25.5,17,2350,1,9
Pontiac,Compact,23,31,0,1,4,2,110,5200,2665,1,15.2,5,181,101,66,39,25,13,2575,1,11.1
Pontiac,Sporty,19,28,2,0,6,3.4,160,4600,1805,1,15.5,4,196,101,75,43,25,13,3240,1,17.7
Pontiac,Midsize,19,27,0,1,6,3.4,200,5000,1890,1,16.5,5,195,108,72,41,28.5,16,3450,1,18.5
Pontiac,Large,19,28,2,1,6,3.8,170,4800,1565,0,18,6,177,111,74,43,30.5,18,3495,1,24.4
Saab,Compact,20,26,1,1,4,2.1,140,6000,2910,1,18,5,184,99,67,37,26.5,14,2775,0,28.7
Saturn,Small,28,38,1,1,4,1.9,85,5000,2145,1,12.8,5,176,102,68,40,26.5,12,2495,1,11.1
Subaru,Small,33,37,0,2,3,1.2,73,5600,2875,1,9.2,4,146,90,60,32,23.5,10,2045,0,8.4
Subaru,Small,25,30,0,2,4,1.8,90,5200,3375,1,15.9,5,175,97,65,35,27.5,15,2490,0,10.9
Subaru,Compact,23,30,1,2,4,2.2,130,5600,2330,1,15.9,5,179,102,67,37,27,14,3085,0,19.5
Suzuki,Small,39,43,0,1,3,1.3,70,6000,3360,1,10.6,4,161,93,63,34,27.5,10,1965,0,8.6
Toyota,Small,32,37,1,1,4,1.5,82,5200,3505,1,11.9,5,162,94,65,36,24,11,2055,0,9.8
Toyota,Sporty,25,32,1,1,4,2.2,135,5400,2405,1,15.9,4,174,99,69,39,23,13,2950,0,18.4
Toyota,Midsize,22,29,1,1,4,2.2,130,5400,2340,1,18.5,5,188,103,70,38,28.5,15,3030,0,18.2
Toyota,Van,18,22,1,2,4,2.4,138,5000,2515,1,19.8,7,187,113,71,41,35,?,3785,0,22.7
Volkswagen,Small,25,33,0,1,4,1.8,81,5500,2550,1,12.4,4,163,93,63,34,26,10,2240,0,9.1
Volkswagen,Van,17,21,0,1,5,2.5,109,4500,2915,1,21.1,7,187,115,72,38,34,?,3960,0,19.7
Volkswagen,Compact,21,30,0,1,4,2,134,5800,2685,1,18.5,5,180,103,67,35,31.5,14,2985,0,20
Volkswagen,Sporty,18,25,0,1,6,2.8,178,5800,2385,1,18.5,4,159,97,66,36,26,15,2810,0,23.3
Volvo,Compact,21,28,1,0,4,2.3,114,5400,2215,1,15.8,5,190,104,67,37,29.5,14,2985,0,22.7
Volvo,Midsize,20,28,2,1,5,2.4,168,6200,2310,1,19.3,5,184,105,69,38,30,15,3245,0,26.7
EOF
}
# Some learners cannot handle the numeric class of auto93, so emit
# auto93 with the class discretized. This is a pretty dumb
# discretizer: each class value is rounded to the nearest multiple of
# 20 and written as "_<multiple>", and the class attribute is
# redeclared as {_0,_20,_40,_60}.
auto93discreteClass() {
  auto93 |
    gawk '
      BEGIN { IGNORECASE = 1; OFS = ","; Round = 20 }
      In && NF > 1  { $NF = "_" (int($NF / Round + 0.5) * Round) }
      $2 == "class" { $3 = "{_0,_20,_40,_60}" }
      /@data/       { In = 1; FS = "," }
      { print }
    '
}
#### some workers
# Experiment driver. For every file matched by $Data:
#   1. copy it to raw.arff and derive logged.arff (logNumbers),
#      discrete.arff and loggedDiscrete.arff (discretizeViaFayyadIrani)
#   2. for each of those four variants, and for Attrs in 4 7 13 16,
#      write ranked<Attrs>.arff with removeAttributes
#   3. repeat $Repeats times with a fresh seed: for every bin in
#      1..$Bins call makeTrainTest and pipe each learner in $Learners
#      through gotwant and abcd; the rows of one variant/Attrs
#      combination are piped into medians
# Works on files in the current directory and leaves its loop variables
# (one, stem, x, Attrs, R, Seed, Bin, Learner) set in the caller.
# logNumbers is not defined in the visible part of this file.
# NOTE(review): ranked.arff is written by rankViaInfoGain but never read;
# removeAttributes is given $x.arff instead -- confirm which was meant.
worker1001() {
for one in $Data; do
cp $one raw.arff
stem=`basename $one`
stem=${stem/.*/}
cat raw.arff | logNumbers arff > logged.arff
discretizeViaFayyadIrani raw.arff > discrete.arff
discretizeViaFayyadIrani logged.arff > loggedDiscrete.arff
for x in raw discrete logged loggedDiscrete; do
rankViaInfoGain $x.arff > ranked.arff
for Attrs in 4 7 13 16; do
removeAttributes $Attrs 16 $x.arff > ranked${Attrs}.arff
blab "$stem $x $Attrs "
echo "#file,x,attrs,bin,learner,a,b,c,d,acc,pd,pf,prec,g"
for((R=1;R<=$Repeats;R++)); do
Seed=$RANDOM
for((Bin=1; Bin <= $Bins ; Bin++)); do
blab "$Bin"
makeTrainTest $Seed $Bins $Bin ranked${Attrs}.arff
for Learner in $Learners; do
$Learner train.arff test.arff |
gotwant |
abcd "$stem,$x,$Attrs,$Bin,$Learner"
done
done
done | medians
blabln
done
done
done
}
# Experiment driver, a variant of worker1001. For every file matched by
# $Data:
#   1. copy it to raw.arff and derive logged.arff (logNumbers, here
#      given the file name as an argument), discrete.arff and
#      loggedDiscrete.arff (discretizeViaFayyadIrani)
#   2. for the two discretized variants only, and for Attrs in
#      4 7 13 16, write ranked<Attrs>.arff with removeAttributes
#   3. pick one seed per variant/Attrs combination (the same seed is
#      used for every repeat), then for $Repeats x $Bins rounds call
#      makeTrainTest and pipe each learner in $Learners through gotwant
#      and abcd; the rows are piped into medians
# Works on files in the current directory and leaves its loop variables
# set in the caller. logNumbers is not defined in the visible part of
# this file.
# NOTE(review): ranked.arff is written by rankViaInfoGain but never read;
# removeAttributes is given $x.arff instead -- confirm which was meant.
worker1002() {
for one in $Data; do
cp $one raw.arff
stem=`basename $one`
stem=${stem/.*/}
logNumbers raw.arff > logged.arff
discretizeViaFayyadIrani raw.arff > discrete.arff
discretizeViaFayyadIrani logged.arff > loggedDiscrete.arff
for x in discrete loggedDiscrete; do
rankViaInfoGain $x.arff > ranked.arff
for Attrs in 4 7 13 16; do
removeAttributes $Attrs 16 $x.arff > ranked${Attrs}.arff
blab "$stem $x $Attrs "
Seed=$RANDOM
echo "#file,x,attrs,bin,learner,a,b,c,d,acc,pd,pf,prec,g"
for((R=1;R<=$Repeats;R++)); do
for((Bin=1; Bin <= $Bins ; Bin++)); do
blab "."
makeTrainTest $Seed $Bins $Bin ranked${Attrs}.arff
for Learner in $Learners; do
$Learner train.arff test.arff |
gotwant |
abcd "$stem,$x,$Attrs,$Bin,$Learner"
done
done
done | medians
blabln
done
done
done
}
#### some demos
# Demo: write the weather data to $Tmp/data.arff and run j4810 on it.
# Globals: Tmp (read), Here (read; the directory returned to)
# Returns: 1 without touching anything if $Tmp is unset or cannot be
#          entered (the file used to land in some other directory).
demo3() {
  [ -n "$Tmp" ] && cd "$Tmp" || return 1
  weather.nominal > data.arff
  j4810 data.arff
  cd "$Here"
}
# Demo: as demo3, but show only paragraph 3 of the j4810 output,
# indented by 4 (see report).
# Globals: Tmp (read), Here (read; the directory returned to)
# Returns: 1 without touching anything if $Tmp is unset or cannot be
#          entered.
demo4() {
  [ -n "$Tmp" ] && cd "$Tmp" || return 1
  weather.nominal > data.arff
  j4810 data.arff | report 4 3
  cd "$Here"
}
# Demo: as demo3, but show paragraphs 3, 18 and 16 of the j4810 output,
# indented by 4 (see report).
# Globals: Tmp (read), Here (read; the directory returned to)
# Returns: 1 without touching anything if $Tmp is unset or cannot be
#          entered.
demo5() {
  [ -n "$Tmp" ] && cd "$Tmp" || return 1
  weather.nominal > data.arff
  j4810 data.arff | report 4 3,18,16
  cd "$Here"
}
# Demo: run j4810 on the soybean data shipped under $Ourmine.
demo5a() {
  local soybean=$Ourmine/lib/arffs/uci/discrete/soybean.arff
  j4810 $soybean
}
# Demo: write the weather data to $Tmp/data.arff and run nb10 on it.
# Globals: Tmp (read), Here (read; the directory returned to)
# Returns: 1 without touching anything if $Tmp is unset or cannot be
#          entered.
demo6() {
  [ -n "$Tmp" ] && cd "$Tmp" || return 1
  weather.nominal > data.arff
  nb10 data.arff
  cd "$Here"
}
# Demo: as demo6, but show paragraphs 2-5 of the nb10 output, indented
# by 4 (see report).
# Globals: Tmp (read), Here (read; the directory returned to)
# Returns: 1 without touching anything if $Tmp is unset or cannot be
#          entered.
demo7() {
  [ -n "$Tmp" ] && cd "$Tmp" || return 1
  weather.nominal > data.arff
  nb10 data.arff | report 4 2,3,4,5
  cd "$Here"
}
# Demo: as demo6, but show paragraphs 18 and 16 of the nb10 output,
# indented by 4 (see report).
# Globals: Tmp (read), Here (read; the directory returned to)
# Returns: 1 without touching anything if $Tmp is unset or cannot be
#          entered.
demo8() {
  [ -n "$Tmp" ] && cd "$Tmp" || return 1
  weather.nominal > data.arff
  nb10 data.arff | report 4 18,16
  cd "$Here"
}
# Demo: write the weather data to $Tmp/data.arff and run j48 with that
# file as both the train and the test set.
# Globals: Tmp (read), Here (read; the directory returned to)
# Returns: 1 without touching anything if $Tmp is unset or cannot be
#          entered.
demo9() {
  [ -n "$Tmp" ] && cd "$Tmp" || return 1
  weather.nominal > data.arff
  j48 data.arff data.arff
  # Was "cd $Home": that variable does not exist, so the function ended
  # up in the home directory instead of going back to $Here.
  cd "$Here"
}
# Demo: reduce the output of demo9 to comma-separated pairs with
# gotwant.
demo10() {
  demo9 |
    gotwant
}
# Demo: score the pairs from demo10 with abcd (one decimal, a header
# line first) and align the columns with malign.
demo11() {
  local header="\na,b,c,d,acc,pd,pf,prec,bal\n"
  demo10 | abcd --before "$header" --decimals 1 | malign
}
# Demo: write the discretized auto93 data to $Tmp/data.arff, run j48
# with that file as both train and test set, and reduce the output to
# pairs with gotwant.
# Globals: Tmp (read), Here (read; the directory returned to)
# Returns: 1 without touching anything if $Tmp is unset or cannot be
#          entered.
demo12() {
  [ -n "$Tmp" ] && cd "$Tmp" || return 1
  auto93discreteClass > data.arff
  j48 data.arff data.arff | gotwant
  cd "$Here"
}
# Demo: show only those pairs from demo12 whose two comma-separated
# fields differ.
demo13() {
  demo12 |
    gawk -F, '$1 != $2'
}
# Demo: score the pairs from demo12 once for each of the goals _0, _20
# and _40 (each result preceded by a header line) and align the columns
# with malign.
demo14() {
  local goal results
  # Keep the pairs so every goal sees them: when demo12 was piped
  # straight into the loop, the first abcd read all of stdin and the
  # later goals got nothing.
  results=$(demo12)
  for goal in _0 _20 _40; do
    printf '%s\n' "$results" |
      abcd --goal "$goal" \
           --prefix "auto93d,$goal" \
           --before "\n#data,goal,a,b,c,d,acc,pd,pf,prec,bal\n" \
           --decimals 1
  done | malign
}
# Demo: one pass over bins 1..10 of the discretized auto93 data. Each
# bin is split off with someArff, learned with j48 and scored for goal
# _20 with abcd; the rows are sorted on column 11, aligned and stored
# in $Tmp/demo15.csv, printed, and copied to $Safe.
# Globals: Tmp, Bins, Safe, Here (all read)
# Returns: 1 without touching anything if $Tmp is unset or cannot be
#          entered.
demo15() {
  [ -n "$Tmp" ] && cd "$Tmp" || return 1
  ( echo "#data,bin, a,b,c,d,acc,pd,pf,prec,bal"
    seed=$RANDOM
    for ((bin = 1; bin <= 10; bin++)); do
      blab "$bin"
      auto93discreteClass | someArff --seed $seed --bins $Bins --bin $bin
      j48 train.arff test.arff | gotwant |
        abcd --goal "_20" --prefix "auto93,$bin" --decimals 1
    done | sort -t, -n -k 11,11
  ) | malign > demo15.csv #each bin
  blabln " "
  echo ""; cat demo15.csv
  cp demo15.csv "$Safe/demo15.csv"
  cd "$Here"
}
# Demo: two repeats of a 5-bin pass over the discretized auto93 data.
# Each bin is split off with someArff, learned with j48 and scored for
# goal _20 with abcd; the rows are sorted on column 12, aligned and
# stored in $Tmp/demo16.csv, printed, and copied to $Safe.
# Globals: Tmp, Safe, Here (all read)
# Returns: 1 without touching anything if $Tmp is unset or cannot be
#          entered.
demo16() {
  local me=demo16
  # The loop visits bins 1..5, so split into 5 bins too; it used to
  # split into $Bins bins and so skipped the rest of them.
  local bins=5
  local repeats=2
  [ -n "$Tmp" ] && cd "$Tmp" || return 1
  ( echo "#data,repeat,bin,a,b,c,d,acc,pd,pf,prec,bal"
    for ((r = 1; r <= repeats; r++)); do
      blab "repeat=$r "
      seed=$RANDOM
      for ((bin = 1; bin <= bins; bin++)); do
        blab "$bin"
        auto93discreteClass | someArff --seed $seed --bins $bins --bin $bin
        j48 train.arff test.arff | gotwant |
          abcd --goal "_20" --prefix "auto93,$r,$bin" --decimals 1
      done
      blabln
    done | sort -t, -n -k 12,12
  ) | malign > $me.csv #each bin
  # Results go to a file of their own; this used to overwrite
  # demo15.csv.
  blabln " "
  echo ""; cat $me.csv
  cp $me.csv "$Safe/$me.csv"
  cd "$Here"
}
# Experiment: for each promise data set, log the numbers and discretize
# once, then run 5 repeats of a 5-bin split; every learner is scored
# with abcd against every class of the data set. All rows are sorted on
# column 14, aligned and stored in $Tmp/demo17.csv, printed, and copied
# to $Safe.
# Globals: Tmp, Ourmine, Safe, Here (all read)
# Calls logNumbers, which is not defined in the visible part of this
# file.
# Returns: 1 without touching anything if $Tmp is unset or cannot be
#          entered.
demo17() {
  local me=demo17
  local bins=5
  local repeats=5
  local learners="lwl50 nb"
  local datas="cm1 kc1 kc2 kc3 mc2 mw1 pc1 ar3 ar4 ar5"
  [ -n "$Tmp" ] && cd "$Tmp" || return 1
  ( echo "#data,repeat,bin,learner,goal,a,b,c,d,acc,pd,pf,prec,bal"
    for data in $datas; do
      arff=$Ourmine/lib/arffs/promise/$data.arff
      cat $arff | logNumbers > logged.arff
      discretizeViaFayyadIrani logged.arff > discrete.arff
      # The class list depends only on the data set, so read it once
      # here rather than once per bin.
      goals=$(cat $arff | classes --brief)
      for ((r = 1; r <= repeats; r++)); do
        blab "data=$data repeat=$r "
        seed=$RANDOM
        for ((bin = 1; bin <= bins; bin++)); do
          blab "$bin"
          cat discrete.arff | someArff --seed $seed --bins $bins --bin $bin
          for learner in $learners; do
            $learner train.arff test.arff | gotwant > results.dat
            for goal in $goals; do
              cat results.dat |
                abcd --goal "$goal" \
                     --prefix "$data,$r,$bin,$learner,$goal" \
                     --decimals 1
            done
          done
        done
        blabln
      done
    done
  ) | sort -t, -n -k 14,14 | malign > $me.csv #each bin
  blabln " "
  echo ""; cat $me.csv
  cp $me.csv "$Safe/$me.csv"
  cd "$Here"
}
# Summarise $Safe/demo17.csv (created by demo17 when absent): for each
# data set and learner named below, print "data,learner, " followed by
# the "##" line that medians (from column 6) produces for the matching
# rows. The result is aligned with malign and sorted on column 12.
demo18() {
  local stats="$Safe/demo17.csv"
  local learners="nb j48 oner"
  local datas="diabetes autos"
  if [ ! -f "$stats" ]; then
    demo17
  fi
  { echo "#data,learner,goal,a,b,c,d,acc,pd,pf,prec,bal"
    for data in $datas; do
      for learner in $learners; do
        echo -n "$data,$learner, "
        grep $data $stats | grep $learner | medians --start 6 | grep "##"
      done
    done
  } | malign | sort -t, -n -k 12,12
}
# Run winLossTie over $Safe/demo17.csv (created by demo17 when absent):
# 14 fields, performance in column 14, keyed on column 4, --95, --high.
demo19() {
  local stats="$Safe/demo17.csv"
  if [ ! -f "$stats" ]; then
    demo17
  fi
  winLossTie --input $stats --fields 14 --perform 14 --key 4 --95 --high
}
# Experiment: for the mdp data sets cm1 and kc1, log the numbers and
# discretize once, then run 10 repeats of a 10-bin split. For each of
# the sampling modes over, under and raw, `sample` is run on train.arff
# and every learner is trained on sampled.arff and scored for the goal
# "true". All rows are sorted on column 15, aligned and stored in
# $Tmp/demoSample.csv, printed, and copied to $Safe.
# Globals: Tmp, Ourmine, Safe, Here (all read)
# Calls logNumbers, which is not defined in the visible part of this
# file.
# Returns: 1 without touching anything if $Tmp is unset or cannot be
#          entered.
demoSample() {
  local me=demoSample
  local bins=10
  local repeats=10
  local learners="nb j48"
  local datas="cm1 kc1"
  [ -n "$Tmp" ] && cd "$Tmp" || return 1
  ( echo "#data,repeat,bin,sample,learner,goal,a,b,c,d,acc,pd,pf,prec,bal"
    for data in $datas; do
      arff=$Ourmine/lib/arffs/mdp/$data.arff
      cat $arff | logNumbers > logged.arff
      discretizeViaFayyadIrani logged.arff > loggedDiscrete.arff
      for ((r = 1; r <= repeats; r++)); do
        blab "data=$data repeat=$r "
        seed=$RANDOM
        for ((bin = 1; bin <= bins; bin++)); do
          blab "$bin"
          cat loggedDiscrete.arff | someArff --seed $seed --bins $bins --bin $bin
          for s in over under raw; do
            sample train.arff true $s
            goals="true"
            for learner in $learners; do
              $learner sampled.arff test.arff | gotwant > results.dat
              for goal in $goals; do
                cat results.dat |
                  abcd --goal "$goal" \
                       --prefix "$data,$r,$bin,$s,$learner,$goal" \
                       --decimals 1
              done
            done
          done
        done
        blabln
      done
    done
  ) | sort -t, -n -k 15,15 | malign > $me.csv #each bin
  blabln " "
  echo ""; cat $me.csv
  cp $me.csv "$Safe/$me.csv"
  cd "$Here"
}
# addAliensData FILE REPLACE ALSOREPLACE FILL
# Hands FILE ($1) and three further values to gawk as F, Replace,
# AlsoReplace and Fill. The awk program copies the header lines of F
# (blank, %, @relation, @attribute, @data) to stdout as it reads them,
# counts the @attribute lines, keeps the remaining lines in
# filelines[], and prints the first half of them unchanged.
# NOTE(review): the awk text is damaged from the line
# "for(k=1;k=0; i--) {" onwards. It looks as if everything between a "<"
# and a later ">" was lost, and the tail appears to belong to a
# different program (it refers to file1lines, which is never filled
# here) -- restore from a clean copy before relying on this function.
# NOTE(review): "while(getline x < F)" would loop forever if F cannot be
# read, since getline then returns -1 -- confirm and compare with "> 0".
addAliensData(){
gawk -v F=$1 -v Seed=$RANDOM -v Replace=$2 -v AlsoReplace=$3 -v Fill=$4 '
BEGIN {
lines=0;
srand(Seed);
while(getline x < F) {
if((x !~ /^[ \t]*$/)&&(x !~ /@attribute/)&&(x !~ /@data/)&&(x!="")&&(x !~ /@relation/)&&(x !~ /^%/)) {
filelines[lines]=x
lines++;
}
else if(x ~ /@attribute/) {
attcount++;
print x
}
else
print x
}
close(F);
for(j=0;j<=lines;j++) {
if(j<(lines/2))
print filelines[j];
else{
if(j%2==0) {
outline=""
split(filelines[j],thisline,",");
for(k=1;k=0; i--) {
x=int(rand()*1000);
while(x>(lines-5)) {
x = x/2;
}
if(file1lines[x] !="") {
print file1lines[x];
}
else
i++;
}
}
'
}
# Demo: combine the soybean data with itself (see combineFiles) and
# store the result in $Safe/soybean/combined.arff.
# Globals: Ourmine (read), Safe (read)
# Returns: non-zero if $Safe/soybean cannot be created.
demoCombine() {
  local soybean="$Ourmine/lib/arffs/uci/discrete/soybean.arff"
  # Nothing else in this file creates $Safe/soybean, so make it here.
  mkdir -p "$Safe/soybean" || return
  combineFiles "$soybean" "$soybean" > "$Safe/soybean/combined.arff"
}
# Concatenate two arff files: print the header lines of file $1 (blank
# lines, % comments, @relation, @attribute, @data) as they are read,
# then the data rows of $1, then the data rows of $2. The header of $2
# is dropped.
# Arguments: $1 - first arff file, $2 - second arff file
# Outputs:   the combined arff on stdout
combineFiles() {
  gawk -v F="$1" -v G="$2" -v Seed=$RANDOM '
    BEGIN {
      lines = 0
      lines2 = 0
      srand(Seed)
      # getline returns -1 when the file cannot be read; without the
      # comparison that counted as true and the loop never ended.
      while ((getline x < F) > 0) {
        if (isData(x)) file1lines[lines++] = x
        else print x
      }
      close(F)
      while ((getline y < G) > 0)
        if (isData(y)) file2lines[lines2++] = y
      close(G)
      # Rows live at 0 .. count-1; "<=" used to print an extra blank
      # line after each file.
      for (i = 0; i < lines; i++) print file1lines[i]
      for (j = 0; j < lines2; j++) print file2lines[j]
    }
    # True for a data row, false for blank, comment and @ header lines.
    function isData(s) {
      return (s !~ /^[ \t]*$/) && (s !~ /@attribute/) && (s !~ /@data/) &&
             (s != "") && (s !~ /@relation/) && (s !~ /^%/)
    }
  '
}
# combineFilesRandom FILE1 FILE2
# Like combineFiles, this prints the header lines of FILE1 as they are
# read and keeps the data rows of both files in arrays; it then starts
# printing randomly chosen rows of FILE1.
# NOTE(review): the awk text is damaged at the line
# "if((lines2>0)&&(x y ? x : y }". It looks as if everything between a
# "<" and a later ">" was lost, fusing the shuffle loop with the
# max/min/sd helper functions that end the program -- restore from a
# clean copy before relying on this function.
# NOTE(review): "file2lines[lines2]=x" stores x (the last line read from
# FILE1) rather than y (the line just read from FILE2) -- looks like a
# slip, confirm.
# NOTE(review): "while(getline x < F)" would loop forever if a file
# cannot be read, since getline then returns -1 -- confirm and compare
# with "> 0".
combineFilesRandom(){
gawk -v F=$1 -v G=$2 -v Seed=$RANDOM '
BEGIN {
lines=0;
lines2=0;
srand(Seed);
while(getline x < F) {
if((x !~ /^[ \t]*$/)&&(x !~ /@attribute/)&&(x !~ /@data/)&&(x!="")&&(x !~ /@relation/)&&(x !~ /^%/)) {
file1lines[lines]=x
lines++;
}
else
print x
}
close(F);
while(getline y < G) {
if((y !~ /^[ \t]*$/)&&(y !~ /@attribute/)&&(y !~ /@data/)&&(y!="")&&(y !~ /@relation/)&&(y !~ /^%/)) {
file2lines[lines2]=x
lines2++;
}
}
close(G);
for(i=lines; i>=0; i--) {
x=int(rand()*1000);
while(x>lines) {
x = x/2;
}
if(file1lines[x] !="") {
print file1lines[x];
}
else
i++;
if((lines2>0)&&(x y ? x : y }
function min(x,y) { return x < y ? x : y }
function sd(sumSq,sumX,n) {
return sqrt((sumSq-((sumX*sumX)/n))/(n-1));
}'
}
# uniques FILE
# For every @attribute declared in the ARFF file FILE, prints one line
#   Uniques in column K: N
# where N is the number of distinct values seen in column K of the data rows.
# Values starting with "?" (ARFF missing-value marker) are not counted.
# The previous version printed only the "Uniques in column K:" heading: its
# counters were reset on every row, its membership test compared against unset
# awk variables, and nothing it computed was ever printed.
# NOTE(review): appending the count to the heading is my reading of the
# intent -- confirm the wanted output format.
uniques() {
  gawk -v F="$1" '
  BEGIN {
    acount = 0
    # "> 0" so that an unreadable file ends the loop (getline returns -1).
    while ((getline x < F) > 0) {
      if (x ~ /@attribute/) {
        acount++
      }
      else if ((x !~ /^[ \t]*$/) && (x !~ /@data/) && (x !~ /@relation/) && (x !~ /^%/)) {
        n = split(x, cell, ",")
        for (i = 1; i <= acount && i <= n; i++) {
          if (cell[i] ~ /^\?/) continue          # missing value
          if (!((i, cell[i]) in seen)) {         # first time in this column
            seen[i, cell[i]] = 1
            ucount[i]++
          }
        }
      }
    }
    close(F)
    for (k = 1; k <= acount; k++)
      print "Uniques in column " k ": " (ucount[k] + 0)
  }
  '
}
# demoCohen: cross-validation experiment comparing the learners j48, jrip and
# part on a fixed list of discrete UCI data sets.
# For each data set, 10 repeats x 10 bins: someArff writes train.arff and
# test.arff, each learner is run on them, and abcd is called once per class
# reported by `classes --brief`. Rows are sorted on field 14, aligned with
# malign, written to $Tmp/cohenResults.csv, copied to $Safe and summarised
# with winLossTie.
# Globals: Tmp, Ourmine, Safe, Here (read). Leaves the shell in $Here.
# Fix: the class list was assigned with "goals==...", which stored a leading
# "=" in front of the first class name so that class never matched in abcd.
demoCohen() {
  local datas="iris sonar weather segment splice audiology vote mushroom labor" ##anneal autos breast-cancer colic diabetes glass"
  local me="cohenResults"
  local bins=10
  local repeats="10"
  local learners="j48 jrip part"
  cd $Tmp
  # The loops run in a subshell, so their variables do not leak to the caller.
  (echo "#data,repeat,bin,learner,goal,a,b,c,d,acc,pd,pf,prec,bal"
  for data in $datas; do
    arff=$Ourmine/lib/arffs/uci/discrete/$data.arff
    for((r=1;r<=$repeats;r++)); do
      blab "data=$data repeat=$r "
      seed=$RANDOM;
      for((bin=1;bin<=$bins;bin++)); do
        blab "$bin"
        cat $arff | someArff --seed $seed --bins $bins --bin $bin --train train.arff --test test.arff
        goals=`cat $arff | classes --brief`
        for learner in $learners; do
          $learner train.arff test.arff | gotwant > results.dat
          for goal in $goals; do
            cat results.dat | tee myfile.dat |
            abcd --goal "$goal" \
                 --prefix "$data,$r,$bin,$learner,$goal" \
                 --decimals 1
          done
        done
      done
      blabln
    done
  done) | sort -t, -n -k 14,14 | malign > $me.csv #each bin
  blabln " "
  echo ""; cat $me.csv
  cp $me.csv $Safe/$me.csv
  winLossTie --input $Safe/$me.csv --fields 14 --perform 14 --key 4 --95 --high
  cd $Here
}
# holteWorker: runs the Holte experiment on each data set named in `datas`.
# Each UCI ARFF is first passed through processA (result kept in
# $Tmp/p<name>.arff) and the prepared file is then given to demoHolte, whose
# stdout is saved as $Safe/holte<name>.
# Globals: Ourmine, Tmp, Safe (read); data (loop variable, left global).
holteWorker() {
  local datas="credit-g primary-tumor"
  ##"anneal audiology autos credit-a credit-g diabetes heart-h ionosphere letter primary-tumor segment sonar splice vehicle vowel waveform-5000"
  ##"v1"
  ##"iris soybean"
  ##"hepatitis labor lymph"
  ##"glass g2"
  #####"breast-cancer kr-vs-kp heart-c colic hypothyroid mushroom sick vote"
  local rawArff preppedArff
  # The loop variable keeps the name `data`: demoHolte refers to $data.
  for data in $datas
  do
    rawArff=$Ourmine/lib/arffs/uci/discrete/$data.arff
    preppedArff=$Tmp/p$data.arff
    ##processVotes $Ourmine/lib/arffs/uci/discrete/$data.arff > $Tmp/v$data.arff
    processA $rawArff > $preppedArff
    demoHolte $preppedArff $data > $Safe/holte$data
  done
}
# processA FILE
# Runs the awk script $Here/lib/processArffs.awk with FILE passed as the awk
# variable F; whatever the script prints goes to stdout.
processA(){
  local script=$Here/lib/processArffs.awk
  gawk -f $script -v F="$1"
}
# processVotes FILE
# Copies the ARFF file FILE to stdout with the 4th comma-separated field
# removed from every data row. Header, blank and % comment lines pass through
# unchanged (the @attribute declarations are NOT adjusted).
# Changes: getline is compared with > 0 so an unreadable file ends the loop
# rather than spinning on the -1 error return; the field count comes from
# split() instead of a counting loop; FILE is quoted.
processVotes(){
  gawk -v F="$1" '
  BEGIN {
    while ((getline x < F) > 0) {
      if ((x ~ /^[ \t]*$/) || (x ~ /@attribute/) || (x ~ /@data/) ||
          (x ~ /@relation/) || (x ~ /^%/)) {
        print x                      # not a data row: pass through
        continue
      }
      n = split(x, line, ",")
      out = line[1]
      for (i = 2; i <= n; i++)
        if (i != 4) out = out "," line[i]
      print out
    }
    close(F)
  }
  '
}
# processGlass FILE MODE
# Filters the ARFF file FILE to stdout. Non-data lines always pass through.
#   MODE = "glass": every data row is printed without its first field.
#   any other MODE: only rows mentioning buildwindfloat, buildwindnon-float or
#                   vehicwindfloat are kept; a row mentioning vehicwindfloat
#                   has its last field replaced by "buildwindfloat".
# Fixes: rejected rows used to be sent to `print x > /dev/null` inside awk,
# which awk parses as the regex /dev/ followed by an unset variable `null`,
# not as the null device, so the rows landed in a stray file (named 0 under
# gawk). They are now simply dropped. getline is compared with > 0 so an
# unreadable file ends the loop; arguments are quoted; the two copies of the
# read loop are merged.
processGlass(){
  gawk -v F="$1" -v S="$2" '
  # True for a data row: not blank, not a % comment, not an @ declaration.
  function isData(s) {
    return (s !~ /^[ \t]*$/) && (s !~ /@attribute/) && (s !~ /@data/) &&
           (s !~ /@relation/) && (s !~ /^%/)
  }
  # Joins a[from..to] with commas.
  function joinFields(a, from, to,    i, s) {
    s = a[from]
    for (i = from + 1; i <= to; i++) s = s "," a[i]
    return s
  }
  BEGIN {
    while ((getline x < F) > 0) {
      if (!isData(x)) { print x; continue }
      n = split(x, line, ",")
      if (S == "glass") {
        print joinFields(line, 2, n)
        continue
      }
      if ((x ~ /buildwindfloat/) || (x ~ /buildwindnon-float/) || (x ~ /vehicwindfloat/)) {
        if (x ~ /vehicwindfloat/) line[n] = "buildwindfloat"
        print joinFields(line, 1, n)
      }
    }
    close(F)
  }
  '
}
# demoHolte ARFF [NAME]
# Runs the learners oner and j48 on ARFF, one learner at a time. For each
# learner: 25 repeats, each calling someArff (bin 1 of 3) to write train.arff
# and test.arff, running the learner, and calling abcd once per class reported
# by `classes --brief`. The rows are sorted on field 10, aligned with malign,
# stored in $Tmp/holteResults.csv, summarised on stdout through medians and
# copied to $Safe/holteResults<NAME>.csv.
# Globals: Tmp, Safe, Here (read). Leaves the shell in $Here.
# Fixes:
#   - `cd $Here` sat inside the learner loop while `cd $Tmp` sat before it, so
#     the second learner ran (and wrote its scratch files) in $Here. The cd
#     into $Tmp now happens at the top of every iteration.
#   - the row prefix referred to $bin, which this function never set; it is
#     now the bin actually handed to someArff (1).
#   - $data was only available when inherited from the caller; it now
#     defaults to NAME ($2) and falls back to the inherited value.
# NOTE(review): every learner overwrites the same .csv files, so only the
# last learner's rows survive on disk -- confirm that is intended.
demoHolte() {
  local me="holteResults"
  local bins=3
  local repeats="25"
  local learners="oner j48"
  local data=${2:-$data}
  local bin=1
  for learner in $learners; do
    cd $Tmp
    (echo "#data,repeat,bin,learner,goal,a,b,c,d,acc,pd,pf,prec,bal"
    arff=$1
    for((r=1;r<=$repeats;r++)); do
      blab "data=$data repeat=$r "
      seed=$RANDOM;
      ##for((bin=1;bin<=$bins;bin++)); do
      ## blab "$bin"
      cat $arff | someArff --seed $seed --bins $bins --bin $bin --train train.arff --test test.arff
      goals=`cat $arff | classes --brief`
      ###for learner in $learners; do
      $learner train.arff test.arff | gotwant > results.dat
      for goal in $goals; do
        cat results.dat |
        abcd --goal "$goal" \
             --prefix "$data,$r,$bin,$learner,$goal" \
             --decimals 1
      done
      ## done
      ## done
      blabln
    done
    ) | sort -t, -n -k 10,10 | malign > $me.csv #each bin
    blabln " "
    echo ""; cat $me.csv | medians --start 2 | malign
    cp $me.csv $Safe/$me$2.csv
    cd $Here
  done
}
# demoEffort: effort-aware defect-prediction experiment on the MDP data set(s)
# listed in `datas` (currently cm1 only).
# Outline, as far as the calls below show:
#   1. addUniqueID / prepForSums build $Tmp/dataWithIDs.arff and
#      $Tmp/dataForSums.arff from the raw ARFF.
#   2. For each outer bin b (1..10) the data is split by someArff into
#      rest$b / test$b; for each inner bin c != b a training set is carved out
#      of rest$b, Weka NaiveBayes predicts test$b, and markInstances turns the
#      predictions into markedData$b$c.arff.
#   3. sumInstances is chained over the marked files so that the last
#      dataForSums$b<d>.arff holds the accumulated result for bin b.
#   4. For thresholds 1..9, labelInstances + effortGotWant + abcde print one
#      aligned result table per threshold.
# Globals: Tmp, Ourmine, Weka (read). All scratch files live in $Tmp.
# NOTE(review): `learners` is set but not used below (NaiveBayes is invoked
# directly), and the shell is left in $Tmp -- there is no `cd $Here` at the
# end, unlike demoCohen/demoHolte.
demoEffort() {
local learners="nb"
local datas="cm1"
cd $Tmp
# Everything below runs in a subshell, so loop variables stay local to it.
(for data in $datas; do
arff=$Ourmine/lib/arffs/mdp/$data.arff
addUniqueID $arff > $Tmp/dataWithIDs.arff
prepForSums $Tmp/dataWithIDs.arff > $Tmp/dataForSums.arff
seed=$RANDOM;
for ((b=1;b<=10;b+=1)); do
# Outer split: bin b of 10, same seed for every b.
cat $Tmp/dataWithIDs.arff | someArff --train $Tmp/rest$b.arff --test $Tmp/test$b.arff --bin $b --bins 10 --seed $seed
for ((c=1;c<=10;c+=1)); do
if [ $c != $b ]; then
# Inner split uses a fresh seed each time; its test part is thrown away.
seed=$RANDOM
cat $Tmp/rest$b.arff | someArff --train $Tmp/train$b$c.arff --test $Tmp/ignored.arff --bin $c --bins 10 --seed $seed
blabln $b","$c
$Weka weka.classifiers.bayes.NaiveBayes -p 1 -t $Tmp/train$b$c.arff -T $Tmp/test$b.arff > $Tmp/dataToBeMarked$b$c
# Written relative to the current directory, which is $Tmp (see cd above).
markInstances $Tmp/dataToBeMarked$b$c $Tmp/test$b.arff > markedData$b$c.arff
fi
done
# Chain the sums: each step adds markedData$b$d to the previous sums file.
for ((d=1;d<=10;d+=1)); do
# `current` names the previous link in the chain (d-1 by default).
current=$d
let "current -= 1"
if (( ("$b" == "1") && ("$d" == "1") )); then
# b == d == 1: no marked file exists for this pair; only a blank is printed.
echo " "
elif (( ("$b" =="1") && ("$d" == "2") )); then
# First link for b == 1 starts from the base sums file.
sumInstances $Tmp/markedData$b$d.arff $Tmp/dataForSums.arff > $Tmp/dataForSums$b$d.arff
elif [ $d -eq "1" ]; then
# First link for every other b also starts from the base sums file.
sumInstances $Tmp/markedData$b$d.arff $Tmp/dataForSums.arff > $Tmp/dataForSums$b$d.arff
elif [ $d != $b ]; then
# d == b was skipped above, so the link before d == b+1 is d-2.
if (( "$d" == ("$b"+1) )); then
let "current -= 1"
fi
sumInstances $Tmp/markedData$b$d.arff $Tmp/dataForSums$b$current.arff > $Tmp/dataForSums$b$d.arff
fi
done
for ((threshold=1;threshold<=9;threshold+=1)); do
# The final sums file is the one for d == 10, except when b == 10 (d == 10 is
# skipped there), in which case it is the one for d == 9.
t=10
if [ $b != $t ]; then
labelInstances $Tmp/dataForSums$b$t.arff $threshold > labeledData$b$threshold.arff
echo "a,b,c,d,effort,acc,pd,pf,prec,bal" | malign
effortGotWant labeledData$b$threshold.arff | abcde --goal "defective" --input labeledData$b$threshold.arff | malign
else
labelInstances $Tmp/dataForSums109.arff $threshold > labeledData$b$threshold.arff
echo "a,b,c,d,effort,acc,pd,pf,prec,bal" | malign
effortGotWant labeledData$b$threshold.arff | abcde --goal "defective" --input labeledData$b$threshold.arff | malign
fi
done
done
done)
}
# abcde [options] < got,want pairs
# Like abcd, but also reports "effort": the share of lines of code (ARFF
# attribute lOCode) sitting in rows that were predicted as the goal class.
# Reads "predicted,actual" pairs from stdin (text after # is ignored) and
# prints one comma-separated row:
#   [prefix,]a,b,c,d,effort,acc,pd,pf,prec,bal      (percentages)
# Options (each takes one value):
#   -g|--goal REGEX     class counted as positive          (default true|yes)
#   -i|--input FILE     ARFF whose k-th data row belongs to the k-th pair
#   -d|--decimals N     decimals in the percentages        (default 2)
#   -p|--prefix TEXT    text printed before the row
#   -b|--before FORMAT  printf format emitted before anything else
# Returns 1 on an unknown option or an option without a value.
# Fixes: effort no longer aborts with a division by zero when FILE has no
# lOCode totals (or no FILE is given) -- it is reported as 0; the ARFF read
# loop stops on read errors; an option missing its value no longer loops
# forever on a failing `shift 2`; the usage text names abcde; expansions are
# quoted and the prefix is no longer used as a printf format.
abcde() {
  local goal="true|yes"
  local before=""
  local prefix=""
  local decimals=2
  local input=""
  while [[ "$1" == -* ]]; do
    case $1 in
      -d|--decimals) decimals=$2;;
      -b|--before)   before=$2;;
      -p|--prefix)   prefix=$2;;
      -g|--goal)     goal=$2;;
      -i|--input)    input=$2;;
      *) blabln "'"$1"' unknown\n usage abcde [options]";
         return 1;;
    esac
    shift 2 || return 1
  done
  [ -n "$before" ] && printf "$before"
  gawk -v F="$input" '
  BEGIN {
    Decimals = 3
    Got = 1
    Want = 2
    Prefix = ""
    True = "true"                 ## define symbol 1
    A = B = C = D = Effort = 0
    FS = OFS = ","
    GoalPd = 1
    GoalPf = 0
  }
  function yes(s) { return s ~ True }
  function no(s)  { return !yes(s) }
  { sub(/#.*/, "") }
  /^[ \t]*$/ { next }
  NF == 2 {
    N++
    Predicted = $Got
    Actual = $Want
    # Contents[N] remembers which cell of the confusion matrix pair N fell in.
    if (no(Actual)  && no(Predicted))  { A++; Contents[N] = "a" }
    if (yes(Actual) && no(Predicted))  { B++; Contents[N] = "b" }
    if (no(Actual)  && yes(Predicted)) { C++; Contents[N] = "c" }
    if (yes(Actual) && yes(Predicted)) { D++; Contents[N] = "d" }
  }
  END {
    # Sum lOCode per confusion-matrix cell; data row k pairs with input k.
    if (F != "") {
      while ((getline x < F) > 0) {
        if (x ~ /@attribute/) {
          attCount++
          split(x, att, " ")
          if (att[2] == "lOCode") loc = attCount
        }
        else if ((x !~ /^[ \t]*$/) && (x !~ /@data/) && (x !~ /@relation/) && (x !~ /^%/)) {
          count++
          split(x, words, ",")
          if (Contents[count] != "") Loc[Contents[count]] += words[loc]
        }
      }
      close(F)
    }
    total = Loc["a"] + Loc["b"] + Loc["c"] + Loc["d"]
    if (total > 0) Effort = (Loc["c"] + Loc["d"]) / total
    fmt = "%." Decimals "f"
    Balance = Precision = Accuracy = Pf = Pd = 0
    if (C+D > 0)     Precision = D/(C+D)
    if (A+B+C+D > 0) Accuracy  = (A+D)/(A+B+C+D)
    if (A+C > 0)     Pf = C/(A+C)
    if (B+D > 0)     Pd = D/(B+D)
    if (B+C+D > 0) {  # special case- everything misses
      Balance = 1 - sqrt((GoalPd - Pd)^2 + (GoalPf - Pf)^2)/sqrt(2)
    }
    if (Prefix) printf "%s", Prefix OFS
    print A, B, C, D,
          sprintf(fmt, 100*Effort),
          sprintf(fmt, 100*Accuracy),
          sprintf(fmt, 100*Pd),
          sprintf(fmt, 100*Pf),
          sprintf(fmt, 100*Precision),
          sprintf(fmt, 100*Balance)
  }' Prefix="$prefix" Decimals="$decimals" True="$goal" -
}
# effortGotWant FILE
# For every data row of the ARFF file FILE prints "first,actual", where
# `first` is the row's first field and `actual` is "defective" when the row's
# last field is exactly "true" and "notDefective" otherwise. Header, blank
# and % comment lines produce no output.
# Changes: getline is compared with > 0 so an unreadable file ends the loop
# instead of spinning on the -1 error return; the last field is located with
# split()'s return value; FILE is quoted.
effortGotWant() {
  gawk -v F="$1" '
  BEGIN {
    while ((getline x < F) > 0) {
      if ((x ~ /^[ \t]*$/) || (x ~ /@attribute/) || (x ~ /@data/) ||
          (x ~ /@relation/) || (x ~ /^%/))
        continue
      n = split(x, words, ",")
      actual = (words[n] == "true") ? "defective" : "notDefective"
      print words[1] "," actual
    }
    close(F)
  }
  '
}
# labelInstances FILE THRESHOLD
# Copies the ARFF file FILE to stdout with a new leading class column:
#   - after the @relation line it emits an empty line and
#     "@attribute defective {defective,notDefective}", and drops the one
#     input line that follows @relation;
#   - each data row is prefixed with "defective," when its first field is
#     >= THRESHOLD, otherwise with "notDefective,";
#   - every other line passes through unchanged.
# Fixes: the dropped line used to be sent to `print x > /dev/null` inside
# awk, which awk parses as the regex /dev/ followed by an unset variable
# `null`, not as the null device, so it landed in a stray file (named 0 under
# gawk); it is now simply skipped. getline is compared with > 0 so an
# unreadable file ends the loop; arguments are quoted.
labelInstances() {
  gawk -v F="$1" -v T="$2" '
  BEGIN {
    skip = 0
    while ((getline x < F) > 0) {
      if (x ~ /@relation/) {
        print x
        print ""
        print "@attribute defective {defective,notDefective}"
        skip = 1
      }
      else if (skip == 1) {
        skip = 0                      # drop the line right after @relation
      }
      else if ((x !~ /^[ \t]*$/) && (x !~ /@attribute/) && (x !~ /@data/) && (x !~ /^%/)) {
        split(x, line, ",")
        label = (line[1] >= T) ? "defective" : "notDefective"
        print label "," x
      }
      else
        print x
    }
    close(F)
  }
  '
}
# sumInstances MARKED SUMS
# Runs a gawk BEGIN program with F=$1, O=$2 and Debug=$Tmp/debug. Data rows of
# F are split on "," and stored as instances[field 2] = field 1. For each data
# row of O the program adds O's first field to the value stored for O's
# second field.
# NOTE(review): the line beginning "for(i=2;i" is damaged -- text has been
# lost there. The lines that follow read from x and F again and print
# `0","x`, so they look like the tail of a different function (demoEffort
# calls a prepForSums that is not defined anywhere else in this part of the
# file -- presumably its header was in the lost text). Restore the original
# text from version control before relying on either function.
sumInstances() {
gawk -v F=$1 -v O=$2 -v Debug=$Tmp/debug '
BEGIN {
count=1;
while(getline x < F) {
if((x !~ /^[ \t]*$/)&&(x !~ /@attribute/)&&(x !~ /@data/)&&(x!="")&&(x !~ /@relation/)&&(x !~ /^%/)) {
split(x,datas,",");
instances[datas[2]]=datas[1];
}
}
close(F);
for(d in datas)
count++;
while(getline y < O) {
if((y !~ /^[ \t]*$/)&&(y !~ /@attribute/)&&(y !~ /@data/)&&(y!="")&&(y !~ /@relation/)&&(y !~ /^%/)) {
split(y,infos,",");
line=0;
line=infos[1]+instances[infos[2]];
out=0;
out=line
for(i=2;i /dev/null;
skip=0;
}
else if((x !~ /^[ \t]*$/)&&(x !~ /@attribute/)&&(x !~ /@data/)&&(x!="")&&(x !~ /@relation/)&&(x !~ /^%/)) {
print 0","x;
}
else
print x
}
close(F);
}
'
}
# markInstances PREDICTIONS ARFF
# Prefixes every data row of ARFF with the 0/1 mark derived from PREDICTIONS.
#   PREDICTIONS: whitespace-separated lines; word 5 minus its first and last
#                character is the instance id, and the mark is 1 when word 4
#                is exactly "true", else 0.
#   ARFF:        after the @relation line an empty line and
#                "@attribute defective {0,1}" are emitted and the one input
#                line that follows @relation is dropped; each data row is
#                printed as "<mark of its first field>,<row>"; other lines
#                pass through unchanged.
# Fixes: the dropped line used to be sent to `print y > /dev/null` inside
# awk, which awk parses as the regex /dev/ followed by an unset variable
# `null`, not as the null device, so it landed in a stray file (named 0 under
# gawk); it is now simply skipped. Both getline loops are compared with > 0 so
# an unreadable file ends the loop; arguments are quoted.
markInstances() {
  gawk -v F="$1" -v T="$2" '
  BEGIN {
    while ((getline x < F) > 0) {
      split(x, words, " ")
      id = substr(words[5], 2, length(words[5]) - 2)
      instances[id] = (words[4] == "true") ? 1 : 0
    }
    close(F)
    skip = 0
    while ((getline y < T) > 0) {
      if (y ~ /@relation/) {
        print y
        print ""
        print "@attribute defective {0,1}"
        skip = 1
      }
      else if (skip == 1) {
        skip = 0                      # drop the line right after @relation
      }
      else if ((y !~ /^[ \t]*$/) && (y !~ /@attribute/) && (y !~ /@data/) && (y !~ /^%/)) {
        split(y, line, ",")
        print instances[line[1]] "," y
      }
      else
        print y
    }
    close(T)
  }
  '
}
# addUniqueID FILE
# Copies the ARFF file FILE to stdout with a running row number (1, 2, ...)
# as a new first column:
#   - lines starting with % are dropped;
#   - after the @relation line an empty line and
#     "@attribute UniqueID numeric" are emitted and the next remaining input
#     line is dropped;
#   - each data row is printed as "<n>,<row>";
#   - every other line passes through unchanged.
# Fixes: dropped lines used to be sent to `print x > /dev/null` inside awk,
# which awk parses as the regex /dev/ followed by an unset variable `null`,
# not as the null device, so they landed in a stray file (named 0 under
# gawk); they are now simply skipped. getline is compared with > 0 so an
# unreadable file ends the loop; FILE is quoted; unused FS/OFS are gone.
addUniqueID() {
  gawk -v F="$1" '
  BEGIN {
    count = 1
    skip = 0
    while ((getline x < F) > 0) {
      if (x ~ /^%/)
        continue                      # comments are not copied
      if (x ~ /@relation/) {
        print x
        print ""
        print "@attribute UniqueID numeric"
        skip = 1
      }
      else if (skip == 1) {
        skip = 0                      # drop the line right after @relation
      }
      else if ((x !~ /^[ \t]*$/) && (x !~ /@attribute/) && (x !~ /@data/)) {
        print count "," x
        count++
      }
      else
        print x
    }
    close(F)
  }
  '
}
### stop reading. broken after this/
# cat file | flip [options]
# Pivots comma-separated result rows into one line per data set and one pair
# of columns per key (e.g. per learner): the performance value and an "X"
# when that value is the best seen for the data set.
# Options (each takes one value):
#   -d|--data COLS         column number(s), comma separated, naming the data set
#   -k|--key COLS          column number(s), comma separated, naming the key
#   -p|--performance COL   column holding the performance figure
# Lines starting with a single # (not ## or #[) are ignored. Data sets and
# keys appear in the order they are first seen; names are the selected fields
# joined with a leading ".".
# Returns 1 on an unknown option or an option without a value.
# Fixes:
#   - --performance read $3 instead of $2, so it was never set when given last;
#   - the column loops used the array index as the field number instead of the
#     column stored in the array, so "--key 2" read field 1;
#   - a stray `exit` in END stopped after the header, leaving the table
#     unreachable; the table is printed now (NOTE(review): confirm the exit
#     was a debugging leftover);
#   - names are no longer used as printf formats; output order is stable.
flip() {
  local data
  local key
  local performance
  while [[ "$1" == -* ]]; do
    case $1 in
      -d|--data)        data="$2";;
      -k|--key)         key="$2";;
      -p|--performance) performance="$2";;
      *) blabln "'"$1"' unknown\n usage cat file | flip [options]"
         return 1;;
    esac
    shift 2 || return 1
  done
  gawk '
  BEGIN { FS = OFS = "," }
  NR == 1 {
    # Command-line assignments are visible here but not yet in BEGIN.
    nData = split(DataStr, TheData, /,/)
    nKeys = split(KeyStr, TheKeys, /,/)
  }
  /^[ \t]*#[^[#]/ { next }
  {
    key = data = ""
    for (d = 1; d <= nData; d++) data = data "." $(TheData[d])
    for (k = 1; k <= nKeys; k++) key  = key  "." $(TheKeys[k])
    perf = $Performance
    Result[key, data] = perf
    if (!(data in Max) || perf > Max[data]) Max[data] = perf
    if (!(key in SeenKey))   { SeenKey[key] = 1;   KeyOrder[++nK] = key }
    if (!(data in SeenData)) { SeenData[data] = 1; DataOrder[++nD] = data }
  }
  END {
    printf "#data"
    for (i = 1; i <= nK; i++) printf ",%s,max?", KeyOrder[i]
    print ""
    for (j = 1; j <= nD; j++) {
      D = DataOrder[j]
      printf "%s", D
      for (i = 1; i <= nK; i++) {
        K = KeyOrder[i]
        if ((K, D) in Result)
          printf ",%s,%s", Result[K, D], (Result[K, D] == Max[D] ? "X" : "")
        else
          printf ",,"
      }
      print ""
    }
  }
  ' DataStr="$data" KeyStr="$key" Performance="$performance" -
  #| medians | malign
}
# summary: prints the overall report built from the demo17 statistics.
# Works inside $Tmp. Generates $Safe/demo17.csv via demo17 when it is missing,
# pivots the demo18 output through flip, then prints win/loss/tie tables:
# first for all rows, then separately for the rows that mention diabetes and
# autos (those subsets are written to <name>.stats in the current directory).
# Globals: Tmp, Safe (read); d (loop variable, left global).
summary() {
  local stats="$Safe/demo17.csv"
  cd $Tmp
  if [ ! -f "$stats" ]
  then
    demo17
  fi
  demo18 |
    flip --data 1 --key 2 --performance 12
  printf "\n---| all |------\n\n"
  winLossTie --input $stats \
             --fields 14 --perform 14 --key 4 --95 --high
  for d in diabetes autos
  do
    printf "\n---| $d |------\n\n"
    grep $d $stats > $d.stats
    winLossTie --input $d.stats \
               --fields 14 --perform 14 --key 4 --95 --high
  done
}
# demo101: condenses $HOME/tmp/safe/demo2.log into $Safe/demo101.log.
# For every data set / learner / prep combination the matching log rows
# (comment rows excluded) are sorted numerically on field 14 and passed to
# `medians --start 6`; a header row is written first. The finished file is
# also echoed to stdout.
# Globals: HOME, Safe (read).
demo101() {
  local me=demo101
  local stats="$HOME/tmp/safe/demo2.log"
  local learners="aode j48 jrip nb oner"
  local preps="loggedDiscrete discrete"
  local datas="cm1 kc1 kc2 kc3_mod mc1_mod mc2_mod mw1_mod pc1 pc2_mod pc3_mod pc4_mod pc5_mod"
  local out=$Safe/$me.log
  # Subshell: the loop variables do not leak into the calling shell.
  (
    echo "#data,prep,attrs,bin,learner,a,b,c,d,acc,pd,pf,prec,bal"
    for data in $datas
    do
      for learner in $learners
      do
        for prep in $preps
        do
          cat $stats | grep -v '#' | grep $data | grep $prep | grep $learner |
            sort -t, -n -k 14,14 |
            medians --start 6
        done
      done
    done
  ) > $out
  cat $out
}
# demo102: pivots the demo101 summary. Works inside $Tmp, builds
# $Safe/demo101.log via demo101 when it is missing, then feeds the rows
# containing "##" to flip (data = column 1, key = columns 2 and 5,
# performance = column 14).
# Globals: Tmp, Safe (read).
demo102() {
  local stats="$Safe/demo101.log"
  cd $Tmp
  if [ ! -f "$stats" ]
  then
    demo101
  fi
  cat $stats |
    grep "##" |
    flip --data 1 --key 2,5 --performance 14
}
# oner nb j48
#auto 56.6 60.4 85.9*
#diabetes 57.2 68.5 69.3*
#
#demo10() {
# demo9 | gawk -F, '/@/ {next}
# NF>1 {print $NF}' | sort | uniq -c
#}
#demo11() {
# setup; cd $Tmp
# demo9 > data.arff
#
# c=0.1
# printf "confidence limit for pruning = $c (very selective)\n\n"
# j4810c data.arff $c | report 0 3,18,16
#
# c=0.25
# printf "confidence limit for pruning = $c (default, less selective)\n\n"
# j4810c data.arff $c | report 0 3,18,16
# cd $Here
#}
#demo1001() {
# setUpVars
# setUpDirs
# setUpSeds
# prep
# cd $Tmp
# pwd
# makeshare
# worker1001 > log
# cp log $Safe/demo1.log
# winLossTie log | tee $Safe/demo1.winLossTie
#}
#demo1002() {
# setUpVars
# setUpDirs
# setUpSeds
# prep
# cd $Tmp
# pwd
# makeshare
# Learners="j48 jrip oner nb aode"
# worker1002 > log
# cp log $Safe/demo1.log
# winLossTie log | tee $Safe/demo1.winLossTie
#}
#### start up
# Everything below runs when this file is sourced, in this order.
# setup initialises the shared variables and directories (it calls setUpVars
# and setUpDirs) and must therefore come before the companion scripts.
setup
# NOTE(review): the companion scripts are sourced by bare name, so bash looks
# them up through PATH and then the current directory -- presumably this
# relies on setUpVars having put $Ourmine/bin on PATH; confirm.
. worker.sh
# Banner; blabln writes to stderr, so it does not pollute piped output.
blabln "OurMine version v2.0 (c)2007-2008 tim@menzies.us/gregoryg@csee.wvu.edu under GPLv3"
blabln "Too many doings, not enough learnings.\n"
# Per-author extension scripts, loaded after the banner.
. greg.sh
. bryan.sh
# Disabled: optional RDR library.
#. ./../../lib/rdr/wisp-rdr.sh