# /* vim: set filetype=sh : */ ### stop reading. broken after this usage: bash our minerc # warning: requires at least 5MB of free disk ########################################################################## # ourmine : a simple learning environment for data mining # Copyright (C) 2007,2008, Tim Menzies (tim@menzies.us, http://menzies.us), # Gregory Gay (gregoryg@csee.wvu.edu) # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, version 3. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . ########################################################################## # for more info on command-line weka stuff, see # http://www.cs.waikato.ac.nz/~remco/weka_bn/node13.html Here=`pwd` #### generic stuff reload() { cd $Here . 
$Ourrc } show() { local goal1="^$1" local com="/^$1 /,/^}/{print}" if (set | grep $goal1 | grep "=" > /tmp/debug) then set | grep $goal1 else set | gawk "$com" fi } blab() { printf "$*" >&2; } blabln() { printf "$*\n" >&2; } #### initialization stuff setup() { setUpVars setUpDirs } lcsee() { alias ls="ls --color" } build() { export AWKPATH="$Here/minerc.lib:$AWKPATH" } setUpVars() { alias ls="ls -G" PROMPT_COMMAND='echo -ne "\033]0;${HOSTNAME}: `pwd`\007"' PS1="Our MINE!: \!$ " Ourmine="$HOME/opt/ourmine" PATH="$Ourmine/bin:$HOME/bin:$PATH" Safe=$Ourmine/var/safe Dirs="$Our/lib/arffs" LibUrl="http://unbox.org/wisp/trunk/our/minerc.lib/lib.zip" export AWKPATH="$Ourmine/lib:$AWKPATH" Bins=10 Repeats=2; Learners="nb nbk" Data="$Ourmine/lib/arff/uci/discrete/a*.arff $Ourmine/lib/arff/uci/discrete/s*.arff" Audit="pgawk --profile=$HOME/tmp/awkprof.out --dump-variables=$HOME/tmp/awkvars.out --lint" } setUpDirs() { mkdir -p $HOME/tmp mkdir -p /tmp/$USER Tmp=`mktemp -d -p /tmp/$USER` Weka="nice -19 java -Xmx1024M -cp $Tmp/weka.jar " mkdir -p $Tmp mkdir -p $Ourmine/lib # for support code mkdir -p $Ourmine/bin # for our executables mkdir -p $HOME/bin # for your executables mkdir -p $Safe # for stuf/Sandf you want to keep around if [ ! -f "$Ourmine/lib/lib.zip" ]; then downloads fi # I had too much trouble with pathname syntax problems # on mac, windows, linux, etc. 
So now I just copy weka.jar # to the working directory (no need for pathnames) cp $Ourmine/lib/weka.jar $Tmp } downloads() { set -x (cd $Ourmine/lib wget -O lib.zip $LibUrl unzip -o lib.zip ) set +x } #### stuff for the turkey experiment setUpSeds() { cat<<-EOF > $Tmp/etc/seds s/loccodeandcomment/loc_code_and_comment/ s/locodeandcomment/loc_code_and_comment/ s/locandcomment/loc_code_and_comment/ s/essential_complexity/ev(g)/ s/cyclomatic_complexity/v(g)/ s/halstead_length/n/ s/halstead_level/l/ s/num_operators/n1/ s/num_operands/n2/ s/unique_operands/uniq_opnd/ s/unique_operators/uniq_op/ s/halstead_content/i/ s/halstead_error_est/b/ s/halstead_prog_time/t/ s/halstead_effort/e/ s/halstead_difficulty/d/ s/halstead_volume/v/ s/loc_comments/loc_comment/ s/design_complexity/iv(g)/ s/locomment/loc_comment/ s/loc_total/loc/ s/locode/loc/ s/[\t ]c[\t ]/ defects / s/[\t ]problems[\t ]/ defects / s/branchcout/branch_count/ s/total_op[\t ]/n1 / s/total_opnd/n2/ s/{no,yes}/{false,true}/ EOF } prep() { for i in $Dirs/mdp/*.arff ; do cat $i | tr A-Z a-z | sed -f $Tmp/etc/seds \ > $Tmp/arff/`basename $i` done } classes() { local brief=0 while [ `echo $1 | grep "-"` ]; do case $1 in -b|--brief) brief=1;; *) blabln "'"$1"' unknown\n usage cat file | classes [options]" return 1;; esac shift 1 done gawk ' BEGIN { OFS=FS="," IGNORECASE=1 Brief=0 } { gsub(/#.*/,"") } /^[ \t]*$/ { next } Data && NF > 1 { Freq[$NF]++ } /@data/ { Data=1 } END { for(N in Freq) if (Brief) { print N } else { print Freq[N],N }} ' Brief=$brief - } intersectAttributes() { # list the intersection of attributes # found in a set of arff files gawk ' BEGIN { IGNORECASE=1; OFS=","} FNR==1 { Files++ } /@attribute/ { Got[$2]++ } END { for(A in Got) if (Got[A]>=Files) print A }' $1 } shared() { for i in `intesectAttributes $Tmp/arff/*.arff | sort | grep -v defects`; do echo $i done echo defects } some() { # generate an arff file that only contains certaina attributes gawk -f some.awk -v Some="$1" $2 } makeshare() { 
Shared=`shared` for i in $Tmp/arff/*.arff; do echo $i some "$Shared" $i > $Tmp/shared/`basename $i` done } report() { gawk 'BEGIN {RS=""; FS="\n"} NR==1 { M=split(Show,Shows,",") } { R[++N]=indent($0) } END {print " "; for(r=1;r<=M;r++) printf("\n%s",R[Shows[r]]); print ""; } function str(n,chr, out) { chr = chr ? chr : " "; while(n-- > 0) out= out chr; return out } function indent(str, i, out) { for(i=1;i<=NF;i++) out=out str(Indent," ") $i "\n" return out } ' Show=$2 Indent=$1 - } #### end inter intra stuff #### misc utils makeTrainTest() { cat - | someArff --seed $1 --bins $2 --bin $3 } gotwant() { gawk ' BEGIN {Unlog = 0; OFS = "," Ee = 848456353 / 312129649; } NF == 3 { if (UnLog) { print Ee^$2 , Ee^$3 } else { print $2,$3 } } NF == 4 { print $2 , $4 } ' - } sample() { gawk -f $Here/lib/sample.awk -v File="$1" -v Wanted="$2" -v Sample="$3" } abcd() { local goal="true|yes" local before="" local prefix="" local decimals=2 while [ `echo $1 | grep "-"` ]; do case $1 in -d|--decimals) decimals=$2;; -b|--before) before=$2;; -p|--prefix) prefix=$2;; -g|--goal) goal=$2;; *) blabln "'"$1"' unknown\n usage abcd [options]"; return 1;; esac shift 2 done [ -n "$before" ] && printf $before gawk ' BEGIN { Decimals = 3 Got = 1 Want = 2; Prefix = ""; True = "true"; ## define symbol 1 A=B=C=D=0 ; FS=OFS="," GoalPd = 1; GoalPf = 0; } function yes(s) {return s ~ True } function no(s) {return ( yes(s) ? 0 : 1 ) } { sub(/#.*/,"") } /^[ \t]*$/ { next } NF==2 { N++; Predicted=$Got; Actual=$Want; if (Predicted == Actual) Good++; if (no( Actual) && no( Predicted)) A++; if (yes(Actual) && no( Predicted)) B++; if (no( Actual) && yes(Predicted)) C++; if (yes(Actual) && yes(Predicted)) D++; #print N,$0,A,B,C,D } END { OFMT = "%." 
Decimals "f"; Balance=Precision=Accuracy=Pf=NotPf=Pd=0; if (C+D > 0 ) Precision = D/(C+D); if ((A+B+C+D) > 0) Accuracy = (A+D)/(A+B+C+D); if (A+C > 0 ) Pf = C/(A+C) if (B+D > 0 ) Pd = D/(B+D); if (B+C+D > 0) { # special case- everything misses Balance = 1 - sqrt((GoalPd - Pd)^2 + (GoalPf - Pf)^2)/sqrt(2) } if(Prefix) printf Txt=Prefix OFS; print A,B,C,D, sprintf(OFMT,100*Accuracy), sprintf(OFMT,100*Pd), sprintf(OFMT,100*Pf), sprintf(OFMT,100*Precision), sprintf(OFMT,100*Balance); }' Prefix="$prefix" Decimals="$decimals" True="$goal" - } malign() { cat - | gawk ' BEGIN { Width=1; Gutter=1; OFS=FS=","; } { N++; for(I=1;I<=NF;I++) { if( (L=length($I)) > Max[I]) Max[I]=L; ++Data[N,0]; Data[N,I]=$I; } } END {for(J=1;J<=N;J++) { Str=Sep1=""; if (Data[J,0]>1) { for(I=1;I<=NF;I++) { L=length(Data[J,I]); Str = Str Sep1 \ str(most(Width,Max[I]+Gutter+1)-L," ") \ Data[J,I]; Sep1= OFS; }} else {Str=Data[J,1]} print Str;} } function str(n,c, out) { while(--n > 0) out = out c; return out; } function most(x,y) { return x > y ? x : y; } ' } quartile2tex() { cat - | gawk 'BEGIN { FS="," OFS="&"} /===/ {print $1; last=0 ; rank=1; next} NF==0 { print ""; next} /#/ {next} Last!=NF { print (NF==7 ? 
"\\scriptsize\\begin{tabular}{rrrr} rank & treatment & median & distribution\\\\" : "stats") ; Last=NF} NF==5 { print statsprint() } NF==7 { print qprint() } function statsprint( sep,i,out) { if ($4 != last) { rank++ } last=$4 out = rank " & " for(i=1;i<=NF;i++) { out = out sep $i sep="&" } return out "\\\\" } function qprint( sep,i,out) { out = "1 & " $1 "&" $4 "&" out=out "\\boxplot{"$2"}{"$3"}{"$4"}{"$5 - $3"}{"$6"}" return out "\\\\" } function trim(s) { gsub(/[ \t]/,"",s); return s } ' # gawk 'BEGIN {FS="\n"; RS=""} # { M=split($0,Lines,/\\t/) # Z=0; # for(I=1;I<=M;I++) { # N=split(Lines[I],Words,/\\t/) # print ++Z " " trim(Lines[I]) # #for(J=1;J<=N; J++) # # print Z " :: " J " :: " trim(Words[J]) # } # } # function trim(s) { # sub(/^[ \t\n]*/,"",s); # sub(/[ \t]\n*$/,"",s); # return s # }' } medians() { local start="2" while [ `echo $1 | grep "-"` ]; do case $1 in -s|--start) start=$2;; *) blabln "'"$1"' unknown\n usage medians [options]"; return 1;; esac shift 2 done gawk ' BEGIN{FS=","} {print} /^[ \t]*$/ {next} /#/ {next} {for(I=Start;I<=NF;I++) { (Data[I,0]++); Data[I,Data[I,0]]=$I } } END{ #printf("#---") #for(I=Start;I<=NF;I++) # printf(",-----") #print "" printf("##"); printf $1 for(I=2;I 2 { for(I=1;I<=Attr;I++) if (I in Num) if ($I !~ /\?/) { if ( ($I +0) < Min) {Bad=1} else {Bad=0} if (Bad) $I= Min; $I=log($I) } print $0 } ' - } winLossTie() { local fields=10 local key=1 local performance=$fields local high=1 local confidence=95 local input="-" while [ `echo $1 | grep "-"` ]; do case $1 in -f|--fields) fields=$2; shift 2;; --99) confidence=99; shift 1;; --95) confidence=95; shift 1;; -k|--key) key=$2; shift 2;; -p|--perform) performance=$2; shift 2;; --high) high=1; shift 1;; --low) high=0; shift 1;; -i|--input) input=$2; shift 2;; *) blabln "'"$1"' unknown\n. 
usage: winLossTie [options]"
         return 1;;
    esac
  done
  (echo "#key,ties,win,loss,win-loss"
   gawk -f mwu.awk Fields=$fields Key=$key Performance=$performance \
        High=$high Confidence=$confidence $input |
   sort -t, -r -n -k 5,5
  ) | malign
}

# Shuffle the data rows of an arff and split them into a test file (one
# bin) and a train file (the rest); header lines go to both files.
# Reads the arff named after the flags, or stdin when none is given.
#Q7: add command-line options to someArff to control the
# names of the generated test/train files. Remember to define default
# values for these variables and to update the help
# text. Hand in your new definition of "someArff"
someArff() {
  local bins=3
  local bin=1
  local seed=$RANDOM
  local train="train.arff"
  local test="test.arff"
  while [[ $1 == -* ]]; do
    case $1 in
      -B|--bins) bins=$2;;
      -b|--bin) bin=$2;;
      -s|--seed) seed=$2;;
      -t|--train) train=$2;;
      -T|--test) test=$2;;
      -h|--help) cat <<-EOF
someArff : divide an arff file into Bins, create train/test files
usage: someArff [flags] [arffFile]   (reads stdin when arffFile is omitted)
Flags
 -B, --bins NUM    Randomly divide the data into NUM bins
 -b, --bin NUM     Store bin NUM into the test file and rest into the train file
 -s, --seed NUM    Set the random number seed to NUM
 -t, --train FILE  Name of the train file (default train.arff)
 -T, --test FILE   Name of the test file (default test.arff)
 -h, --help        Print this text
EOF
         return 1;;
      *) blabln "'$1' unknown\n usage cat file | someArff [options]"
         return 1;;
    esac
    # a trailing flag without a value used to make this loop spin forever
    shift 2 || return 1
  done
  # Name=value operands are applied after BEGIN, so they override the
  # defaults set there.
  gawk '
    BEGIN { IGNORECASE=1;
            Trainf="train.arff";
            Testf="test.arff";
            Bins=3;
            Bin=1;
            Seed=1; }
    /^[ \t]*$/ { next }
    /@relation/ { Seed ? srand(Seed) : srand(1) }
    /@relation/ { printf "">Trainf; printf "">Testf }
    /@relation/,/@data/ { print $0 >> Trainf; print $0 >> Testf; next }
    { Line[rand()] = $0; Lines++ }
    END { ###print Seed
          Start = Lines/Bins * (Bin - 1) ;
          Stop = Lines/Bins * Bin;
          for(I in Line) {
            N++;
            # N runs 1..Lines, so the bin is the half-open range
            # (Start, Stop]. The old test (N >= Start && N < Stop) never
            # put row number Lines in any test file.
            What = (N > Start && N <= Stop) ? Testf : Trainf
            print Line[I]>>What; }
        }
  ' Seed=$seed Bins=$bins Bin=$bin Trainf="$train" Testf="$test" "${1:--}"
}

# list the intersection of attributes
# found in a set of arff files
intersectAttributes() {
  # "$@" rather than $1: callers pass a glob that expands to many files.
  gawk '
    BEGIN { IGNORECASE=1; OFS="," }
    FNR==1 { Files++ }
    /@attribute/ { Got[$2]++ }
    END { for(A in Got) if (Got[A]>=Files) print A }
  ' "$@"
}

# Sorted attribute names common to every mdp arff, "defects" last.
shared() {
  # was "intesectAttributes", a name that is not defined
  intersectAttributes "$Ourmine"/lib/arffs/mdp/*.arff | sort | grep -v defects
  echo defects
}

# generate an arff file that only contains certain attributes
some() {
  gawk -f some.awk -v Some="$1" $2
}

# Project each data set in $Sets onto the shared attributes, writing the
# results to $Tmp/shared.
makeshare() {
  Shared=$(shared)
  # was `Sets= "cm1 kc1"`, which tried to run a command named "cm1 kc1"
  Sets="cm1 kc1"
  mkdir -p "$Tmp/shared" # not created anywhere else
  for i in $Sets; do
    echo "$i"
    # NOTE(review): shared reads lib/arffs/mdp/ but this reads lib/arffs/ -- confirm.
    some "$Shared" "$Ourmine/lib/arffs/$i.arff" > "$Tmp/shared/$(basename "$i")"
  done
}

# Print min,q1,median,q3,max and a text box plot for column 1 of stdin.
# NOTE(review): the line breaks of the settings below were lost, so it is
# unclear which were commented out. Only F is kept live here because the
# printf format needs it; Shrink, Right, Off, Low, High, Median, Min and
# Max are read later but are not set by this program.
quartile() {
  gawk '
    BEGIN { FS = OFS = ",";
      #
      # Shrink = 2;
      # Left = -100; Right = 100;
      # Off = " ";
      # Max = "]";
      # Min = "[";
      # Median = "|";
      # Low ="-"; High = "+";
      F = "5.1f"
      #
      #Header = "min,q1,median,q3,max,"
    }
    { S[++N]=$1 }
    END { asort(S); if(Header) print Header; report(S,N) }
    function round(x) { return int(x<0 ?
x-0.5 : x+0.5) }
    function report(s,n, min,q1,median,q3,max) {
      min = s[1];
      q1 = s[int(n/4)];
      median= s[int(2*n/4)];
      q3 = s[int(3*n/4)];
      max = s[n];
      printf("%"F",%"F",%"F",%"F",%"F",%s\n",
             min,q1,median,q3,max,
             quart(min,q1,median,q3,max,Right,Shrink)) | "sort -t, -r -n -k 2,2"
    }
    function quart(min,q1,median,q3,max,width, scale, i,l,str) {
      width /= scale
      min /= scale
      q1 /= scale
      median /= scale
      q3 /= scale
      max /= scale
      for(i= 1; i<=width; i++) l[int(i)]= Off;
      for(i=min; i<=q1; i++) l[int(i)]= Low;
      for(i= q3; i<=max; i++) l[int(i)]= High;
      l[int(median)] = Median
      for(i= 1; i<=width; i++) str = str l[int(i)];
      return Min str Max
    }
  ' -
}

#### Weka stuff
# $Weka is left unquoted on purpose: it holds a command plus its options.
# The filters write tmp.arff in the current directory and then cat it.
# Each wrapper prints a progress mark on stderr via blab.

## pruning columns

# Remove attributes $1 through $2 from arff file $3.
removeAttributes() {
  blab "/"
  $Weka weka.filters.unsupervised.attribute.Remove \
    -R "${1}-${2}" -i "$3" -o tmp.arff
  # A stray `set +x` here used to switch off a caller's tracing.
  cat tmp.arff
}

## discretization

# Supervised discretization of every attribute of arff file $1.
discretizeViaFayyadIrani() {
  blab "x"
  $Weka weka.filters.supervised.attribute.Discretize \
    -c last -R first-last -i "$1" -o tmp.arff
  cat tmp.arff
}

## feature subset selection

# Rank the attributes of arff file $1 with InfoGainAttributeEval.
rankViaInfoGain() {
  blab "<"
  $Weka weka.filters.supervised.attribute.AttributeSelection \
    -S "weka.attributeSelection.Ranker -T -2.7976931348623157E308 -N -1" \
    -E "weka.attributeSelection.InfoGainAttributeEval" \
    -i "$1" -o tmp.arff
  cat tmp.arff
}

### learners
# Two-argument learners pass $1 as -t and $2 as -T;
# the *10 variants pass only -t $1, except aode10.

## classifiers
# rule-based classifiers
oner() {
  blab "1"
  $Weka weka.classifiers.rules.OneR \
    -B 6 \
    -p 0 -t "$1" -T "$2"
}

jrip() {
  blab "j"
  $Weka weka.classifiers.rules.JRip \
    -F 3 -N 2.0 -O 2 -S 1 \
    -p 0 -t "$1" -T "$2"
}

jrip10() {
  blab "j"
  $Weka weka.classifiers.rules.JRip \
    -F 3 -N 2.0 -O 2 -S 1 \
    -t "$1"
}

part() {
  blab "p"
  $Weka weka.classifiers.rules.PART \
    -M 2 -C 0.25 -Q 1 -t "$1" -T "$2"
}

# bayesian classifiers
aode() {
  blab "a"
  $Weka weka.classifiers.bayes.AODE \
    "-F" 0 \
    -p 0 -t "$1" -T "$2"
}

aode10() {
  blab "a"
  $Weka weka.classifiers.bayes.AODE \
    "-F" 0 \
    -t "$1" -T "$2"
}

nb() {
  blab "n"
  $Weka weka.classifiers.bayes.NaiveBayes \
    -p 0 -t "$1" -T "$2"
}

nb10() {
  blab "n"
  $Weka weka.classifiers.bayes.NaiveBayes \
    -i -t "$1"
}

nbk() {
  blab "k"
  $Weka weka.classifiers.bayes.NaiveBayes \
    -K \
    -p 0 -t "$1" -T "$2"
}

pb() {
  blab "pb"
  java -jar PercentileBayes.jar $1 $2 $3 $4
}

# decision tree learners
j48() {
  blab "c"
  $Weka weka.classifiers.trees.J48 \
    -C 0.25 -M 2 \
    -p 0 -t "$1" -T "$2"
}

j4810() {
  blab "c"
  $Weka weka.classifiers.trees.J48 \
    -C 0.25 -M 2 \
    -i -t "$1"
}

# As j4810, with the -C value taken from $2.
j4810c() {
  blab "c$2"
  $Weka weka.classifiers.trees.J48 \
    -C "$2" -M 2 \
    -i -t "$1"
}

## linear-model learners
lsr() {
  blab "L"
  $Weka weka.classifiers.functions.LinearRegression \
    -S 0 -R 1.0E-8 \
    -p 0 -t "$1" -T "$2"
}

m5p() {
  blab "P"
  $Weka weka.classifiers.trees.M5P \
    -p 0 -t "$1" -T "$2"
}

## nearest neighbor
1Bkx() {
  blab "N"
  $Weka weka.classifiers.lazy.IBk \
    -K 1 -W 0 -X -E \
    -p 0 -t "$1" -T "$2"
}

1Bk() {
  blab "n"
  $Weka weka.classifiers.lazy.IBk \
    -K -1 -W 0 -E \
    -p 0 -t "$1" -T "$2"
}

## association rule learners
apriori() {
  blab "A"
  $Weka weka.associations.Apriori \
    -N 10 -T 0 -C 0.9 -D 0.05 -U 1.0 -M 0.1 -S -1.0 \
    -p 0 -t "$1" -T "$2"
}

#### teaching demos

# Print the weather.nominal teaching data set as an arff.
weather.nominal() {
  cat<<-EOF
@relation weather.nominal
@attribute outlook {sunny, overcast, rainy}
@attribute temperature {hot, mild, cool}
@attribute humidity {high, normal}
@attribute windy {TRUE, FALSE}
@attribute play {yes, no}
@data
sunny,hot,high,FALSE,no
sunny,hot,high,TRUE,no
overcast,hot,high,FALSE,yes
rainy,mild,high,FALSE,yes
rainy,cool,normal,FALSE,yes
rainy,cool,normal,TRUE,no
overcast,cool,normal,TRUE,yes
sunny,mild,high,FALSE,no
sunny,cool,normal,FALSE,yes
rainy,mild,normal,FALSE,yes
sunny,mild,normal,TRUE,yes
overcast,mild,high,TRUE,yes
overcast,hot,normal,FALSE,yes
rainy,mild,high,TRUE,no
EOF
}

# Print the auto93 data set (numeric class) as an arff.
auto93() {
  cat<<-EOF
@relation 'auto93.names'
@attribute Manufacturer { Acura, Audi, BMW, Buick, Cadillac, Chevrolet, Chrysler, Dodge, Eagle, Ford, Geo, Honda, Hyundai, Infiniti, Lexus, Lincoln, Mazda, Mercedes-Benz, Mercury, Mitsubishi, Nissan, Oldsmobile, Plymouth, Pontiac, Saab, Saturn, Subaru, Suzuki, Toyota, Volkswagen, Volvo}
@attribute Type { Small, Midsize, Compact, Large, Sporty, Van}
@attribute
City_MPG real @attribute Highway_MPG real @attribute Air_Bags_standard { 0, 2, 1} @attribute Drive_train_type { 1, 0, 2} @attribute Number_of_cylinders real @attribute Engine_size real @attribute Horsepower real @attribute RPM real @attribute Engine_revolutions_per_mile real @attribute Manual_transmission_available { 1, 0} @attribute Fuel_tank_capacity real @attribute Passenger_capacity real @attribute Length real @attribute Wheelbase real @attribute Width real @attribute U-turn_space real @attribute Rear_seat_room real @attribute Luggage_capacity real @attribute Weight real @attribute Domestic { 0, 1} @attribute class real @data Acura,Small,25,31,0,1,4,1.8,140,6300,2890,1,13.2,5,177,102,68,37,26.5,11,2705,0,15.9 Acura,Midsize,18,25,2,1,6,3.2,200,5500,2335,1,18,5,195,115,71,38,30,15,3560,0,33.9 Audi,Compact,20,26,1,1,6,2.8,172,5500,2280,1,16.9,5,180,102,67,37,28,14,3375,0,29.1 Audi,Midsize,19,26,2,1,6,2.8,172,5500,2535,1,21.1,6,193,106,70,37,31,17,3405,0,37.7 BMW,Midsize,22,30,1,0,4,3.5,208,5700,2545,1,21.1,4,186,109,69,39,27,13,3640,0,30 Buick,Midsize,22,31,1,1,4,2.2,110,5200,2565,0,16.4,6,189,105,69,41,28,16,2880,1,15.7 Buick,Large,19,28,1,1,6,3.8,170,4800,1570,0,18,6,200,111,74,42,30.5,17,3470,1,20.8 Buick,Large,16,25,1,0,6,5.7,180,4000,1320,0,23,6,216,116,78,45,30.5,21,4105,1,23.7 Buick,Midsize,19,27,1,1,6,3.8,170,4800,1690,0,18.8,5,198,108,73,41,26.5,14,3495,1,26.3 Cadillac,Large,16,25,1,1,8,4.9,200,4100,1510,0,18,6,206,114,73,43,35,18,3620,1,34.7 Cadillac,Midsize,16,25,2,1,8,4.6,295,6000,1985,0,20,5,204,111,74,44,31,14,3935,1,40.1 Chevrolet,Compact,25,36,0,1,4,2.2,110,5200,2380,1,15.2,5,182,101,66,38,25,13,2490,1,13.4 Chevrolet,Compact,25,34,1,1,4,2.2,110,5200,2665,1,15.6,5,184,103,68,39,26,14,2785,1,11.4 Chevrolet,Sporty,19,28,2,0,6,3.4,160,4600,1805,1,15.5,4,193,101,74,43,25,13,3240,1,15.1 Chevrolet,Midsize,21,29,0,1,4,2.2,110,5200,2595,0,16.5,6,198,108,71,40,28.5,16,3195,1,15.9 
Chevrolet,Van,18,23,0,1,6,3.8,170,4800,1690,0,20,7,178,110,74,44,30.5,?,3715,1,16.3 Chevrolet,Van,15,20,0,2,6,4.3,165,4000,1790,0,27,8,194,111,78,42,33.5,?,4025,1,16.6 Chevrolet,Large,17,26,1,0,8,5,170,4200,1350,0,23,6,214,116,77,42,29.5,20,3910,1,18.8 Chevrolet,Sporty,17,25,1,0,8,5.7,300,5000,1450,1,20,2,179,96,74,43,?,?,3380,1,38 Chrysler,Large,20,28,2,1,6,3.3,153,5300,1990,0,18,6,203,113,74,40,31,15,3515,1,18.4 Chrysler,Compact,23,28,2,1,4,3,141,5000,2090,0,16,6,183,104,68,41,30.5,14,3085,1,15.8 Chrysler,Large,20,26,1,1,6,3.3,147,4800,1785,0,16,6,203,110,69,44,36,17,3570,1,29.5 Dodge,Small,29,33,0,1,4,1.5,92,6000,3285,1,13.2,5,174,98,66,32,26.5,11,2270,1,9.2 Dodge,Small,23,29,1,1,4,2.2,93,4800,2595,1,14,5,172,97,67,38,26.5,13,2670,1,11.3 Dodge,Compact,22,27,1,1,4,2.5,100,4800,2535,1,16,6,181,104,68,39,30.5,14,2970,1,13.3 Dodge,Van,17,21,1,2,6,3,142,5000,1970,0,20,7,175,112,72,42,26.5,?,3705,1,19 Dodge,Midsize,21,27,1,1,4,2.5,100,4800,2465,0,16,6,192,105,69,42,30.5,16,3080,1,15.6 Dodge,Sporty,18,24,1,2,6,3,300,6000,2120,1,19.8,4,180,97,72,40,20,11,3805,1,25.8 Eagle,Small,29,33,0,1,4,1.5,92,6000,2505,1,13.2,5,174,98,66,36,26.5,11,2295,1,12.2 Eagle,Large,20,28,2,1,6,3.5,214,5800,1980,0,18,6,202,113,74,40,30,15,3490,1,19.3 Ford,Small,31,33,0,1,4,1.3,63,5000,3150,1,10,4,141,90,63,33,26,12,1845,1,7.4 Ford,Small,23,30,0,1,4,1.8,127,6500,2410,1,13.2,5,171,98,67,36,28,12,2530,1,10.1 Ford,Compact,22,27,0,1,4,2.3,96,4200,2805,1,15.9,5,177,100,68,39,27.5,13,2690,1,11.3 Ford,Sporty,22,29,1,0,4,2.3,105,4600,2285,1,15.4,4,180,101,68,40,24,12,2850,1,15.9 Ford,Sporty,24,30,1,1,4,2,115,5500,2340,1,15.5,4,179,103,70,38,23,18,2710,1,14 Ford,Van,15,20,1,2,6,3,145,4800,2080,1,21,7,176,119,72,45,30,?,3735,1,19.9 Ford,Midsize,21,30,1,1,6,3,140,4800,1885,0,16,5,192,106,71,40,27.5,18,3325,1,20.2 Ford,Large,18,26,1,0,8,4.6,190,4200,1415,0,20,6,212,114,78,43,30,21,3950,1,20.9 Geo,Small,46,50,0,1,3,1,55,5700,3755,1,10.6,4,151,93,63,34,27.5,10,1695,0,8.4 
Geo,Sporty,30,36,1,1,4,1.6,90,5400,3250,1,12.4,4,164,97,67,37,24.5,11,2475,0,12.5 Honda,Sporty,24,31,2,1,4,2.3,160,5800,2855,1,15.9,4,175,100,70,39,23.5,8,2865,0,19.8 Honda,Small,42,46,1,1,4,1.5,102,5900,2650,1,11.9,4,173,103,67,36,28,12,2350,0,12.1 Honda,Compact,24,31,2,1,4,2.2,140,5600,2610,1,17,4,185,107,67,41,28,14,3040,0,17.5 Hyundai,Small,29,33,0,1,4,1.5,81,5500,2710,1,11.9,5,168,94,63,35,26,11,2345,0,8 Hyundai,Small,22,29,0,1,4,1.8,124,6000,2745,1,13.7,5,172,98,66,36,28,12,2620,0,10 Hyundai,Sporty,26,34,0,1,4,1.5,92,5550,2540,1,11.9,4,166,94,64,34,23.5,9,2285,0,10 Hyundai,Midsize,20,27,0,1,4,2,128,6000,2335,1,17.2,5,184,104,69,41,31,14,2885,0,13.9 Infiniti,Midsize,17,22,1,0,8,4.5,278,6000,1955,0,22.5,5,200,113,72,42,29,15,4000,0,47.9 Lexus,Midsize,18,24,1,1,6,3,185,5200,2325,1,18.5,5,188,103,70,40,27.5,14,3510,0,28 Lexus,Midsize,18,23,2,0,6,3,225,6000,2510,1,20.6,4,191,106,71,39,25,9,3515,0,35.2 Lincoln,Midsize,17,26,2,1,6,3.8,160,4400,1835,0,18.4,6,205,109,73,42,30,19,3695,1,34.3 Lincoln,Large,18,26,2,0,8,4.6,210,4600,1840,0,20,6,219,117,77,45,31.5,22,4055,1,36.1 Mazda,Small,29,37,0,1,4,1.6,82,5000,2370,1,13.2,4,164,97,66,34,27,16,2325,0,8.3 Mazda,Small,28,36,0,1,4,1.8,103,5500,2220,1,14.5,5,172,98,66,36,26.5,13,2440,0,11.6 Mazda,Compact,26,34,1,1,4,2.5,164,5600,2505,1,15.5,5,184,103,69,40,29.5,14,2970,0,16.5 Mazda,Van,18,24,0,2,6,3,155,5000,2240,0,19.6,7,190,110,72,39,27.5,?,3735,0,19.1 Mazda,Sporty,17,25,1,0,?,1.3,255,6500,2325,1,20,2,169,96,69,37,?,?,2895,0,32.5 Mercedes-Benz,Compact,20,29,1,0,4,2.3,130,5100,2425,1,14.5,5,175,105,67,34,26,12,2920,0,31.9 Mercedes-Benz,Midsize,19,25,2,0,6,3.2,217,5500,2220,0,18.5,5,187,110,69,37,27,15,3525,0,61.9 Mercury,Sporty,23,26,1,1,4,1.6,100,5750,2475,1,11.1,4,166,95,65,36,19,6,2450,1,14.1 Mercury,Midsize,19,26,0,0,6,3.8,140,3800,1730,0,18,5,199,113,73,38,28,15,3610,1,14.9 Mitsubishi,Small,29,33,0,1,4,1.5,92,6000,2505,1,13.2,5,172,98,67,36,26,11,2295,0,10.3 
Mitsubishi,Midsize,18,24,1,1,6,3,202,6000,2210,0,19,5,190,107,70,43,27.5,14,3730,0,26.1 Nissan,Small,29,33,1,1,4,1.6,110,6000,2435,1,13.2,5,170,96,66,33,26,12,2545,0,11.8 Nissan,Compact,24,30,1,1,4,2.4,150,5600,2130,1,15.9,5,181,103,67,40,28.5,14,3050,0,15.7 Nissan,Van,17,23,0,1,6,3,151,4800,2065,0,20,7,190,112,74,41,27,?,4100,0,19.1 Nissan,Midsize,21,26,1,1,6,3,160,5200,2045,0,18.5,5,188,104,69,41,28.5,14,3200,0,21.5 Oldsmobile,Compact,24,31,0,1,4,2.3,155,6000,2380,0,15.2,5,188,103,67,39,28,14,2910,1,13.5 Oldsmobile,Midsize,23,31,1,1,4,2.2,110,5200,2565,0,16.5,5,190,105,70,42,28,16,2890,1,16.3 Oldsmobile,Van,18,23,0,1,6,3.8,170,4800,1690,0,20,7,194,110,74,44,30.5,?,3715,1,19.5 Oldsmobile,Large,19,28,1,1,6,3.8,170,4800,1570,0,18,6,201,111,74,42,31.5,17,3470,1,20.7 Plymouth,Sporty,23,30,0,2,4,1.8,92,5000,2360,1,15.9,4,173,97,67,39,24.5,8,2640,1,14.4 Pontiac,Small,31,41,0,1,4,1.6,74,5600,3130,1,13.2,4,177,99,66,35,25.5,17,2350,1,9 Pontiac,Compact,23,31,0,1,4,2,110,5200,2665,1,15.2,5,181,101,66,39,25,13,2575,1,11.1 Pontiac,Sporty,19,28,2,0,6,3.4,160,4600,1805,1,15.5,4,196,101,75,43,25,13,3240,1,17.7 Pontiac,Midsize,19,27,0,1,6,3.4,200,5000,1890,1,16.5,5,195,108,72,41,28.5,16,3450,1,18.5 Pontiac,Large,19,28,2,1,6,3.8,170,4800,1565,0,18,6,177,111,74,43,30.5,18,3495,1,24.4 Saab,Compact,20,26,1,1,4,2.1,140,6000,2910,1,18,5,184,99,67,37,26.5,14,2775,0,28.7 Saturn,Small,28,38,1,1,4,1.9,85,5000,2145,1,12.8,5,176,102,68,40,26.5,12,2495,1,11.1 Subaru,Small,33,37,0,2,3,1.2,73,5600,2875,1,9.2,4,146,90,60,32,23.5,10,2045,0,8.4 Subaru,Small,25,30,0,2,4,1.8,90,5200,3375,1,15.9,5,175,97,65,35,27.5,15,2490,0,10.9 Subaru,Compact,23,30,1,2,4,2.2,130,5600,2330,1,15.9,5,179,102,67,37,27,14,3085,0,19.5 Suzuki,Small,39,43,0,1,3,1.3,70,6000,3360,1,10.6,4,161,93,63,34,27.5,10,1965,0,8.6 Toyota,Small,32,37,1,1,4,1.5,82,5200,3505,1,11.9,5,162,94,65,36,24,11,2055,0,9.8 Toyota,Sporty,25,32,1,1,4,2.2,135,5400,2405,1,15.9,4,174,99,69,39,23,13,2950,0,18.4 
Toyota,Midsize,22,29,1,1,4,2.2,130,5400,2340,1,18.5,5,188,103,70,38,28.5,15,3030,0,18.2 Toyota,Van,18,22,1,2,4,2.4,138,5000,2515,1,19.8,7,187,113,71,41,35,?,3785,0,22.7 Volkswagen,Small,25,33,0,1,4,1.8,81,5500,2550,1,12.4,4,163,93,63,34,26,10,2240,0,9.1 Volkswagen,Van,17,21,0,1,5,2.5,109,4500,2915,1,21.1,7,187,115,72,38,34,?,3960,0,19.7 Volkswagen,Compact,21,30,0,1,4,2,134,5800,2685,1,18.5,5,180,103,67,35,31.5,14,2985,0,20 Volkswagen,Sporty,18,25,0,1,6,2.8,178,5800,2385,1,18.5,4,159,97,66,36,26,15,2810,0,23.3 Volvo,Compact,21,28,1,0,4,2.3,114,5400,2215,1,15.8,5,190,104,67,37,29.5,14,2985,0,22.7 Volvo,Midsize,20,28,2,1,5,2.4,168,6200,2310,1,19.3,5,184,105,69,38,30,15,3245,0,26.7 EOF } auto93discreteClass() { #some learners can't handle auto93's numeric class #so we discretize the class. Note that this is a pretty # dumb discretizer. auto93 | gawk 'BEGIN {IGNORECASE=1; OFS=","; Round=20} In && NF > 1 {$NF= "_"int($NF/Round+0.5)*Round} $2 =="class" {$3 = "{_0,_20,_40,_60}"} /@data/ {In=1; FS=","} { print}' } #### some workers worker1001() { for one in $Data; do cp $one raw.arff stem=`basename $one` stem=${stem/.*/} cat raw.arff | logNumbers arff > logged.arff discretizeViaFayyadIrani raw.arff > discrete.arff discretizeViaFayyadIrani logged.arff > loggedDiscrete.arff for x in raw discrete logged loggedDiscrete; do rankViaInfoGain $x.arff > ranked.arff for Attrs in 4 7 13 16; do removeAttributes $Attrs 16 $x.arff > ranked${Attrs}.arff blab "$stem $x $Attrs " echo "#file,x,attrs,bin,learner,a,b,c,d,acc,pd,pf,prec,g" for((R=1;R<=$Repeats;R++)); do Seed=$RANDOM for((Bin=1; Bin <= $Bins ; Bin++)); do blab "$Bin" makeTrainTest $Seed $Bins $Bin ranked${Attrs}.arff for Learner in $Learners; do $Learner train.arff test.arff | gotwant | abcd "$stem,$x,$Attrs,$Bin,$Learner" done done done | medians blabln done done done } worker1002() { for one in $Data; do cp $one raw.arff stem=`basename $one` stem=${stem/.*/} logNumbers raw.arff > logged.arff discretizeViaFayyadIrani raw.arff > 
discrete.arff discretizeViaFayyadIrani logged.arff > loggedDiscrete.arff for x in discrete loggedDiscrete; do rankViaInfoGain $x.arff > ranked.arff for Attrs in 4 7 13 16; do removeAttributes $Attrs 16 $x.arff > ranked${Attrs}.arff blab "$stem $x $Attrs " Seed=$RANDOM echo "#file,x,attrs,bin,learner,a,b,c,d,acc,pd,pf,prec,g" for((R=1;R<=$Repeats;R++)); do for((Bin=1; Bin <= $Bins ; Bin++)); do blab "." makeTrainTest $Seed $Bins $Bin ranked${Attrs}.arff for Learner in $Learners; do $Learner train.arff test.arff | gotwant | abcd "$stem,$x,$Attrs,$Bin,$Learner" done done done | medians blabln done done done } #### some demos demo3() { cd $Tmp weather.nominal > data.arff j4810 data.arff cd $Here } demo4() { cd $Tmp weather.nominal > data.arff j4810 data.arff | report 4 3 cd $Here } demo5() { cd $Tmp weather.nominal > data.arff j4810 data.arff | report 4 3,18,16 cd $Here } demo5a() { j4810 $Ourmine/lib/arffs/uci/discrete/soybean.arff } demo6() { cd $Tmp weather.nominal > data.arff nb10 data.arff cd $Here } demo7() { cd $Tmp weather.nominal > data.arff nb10 data.arff | report 4 2,3,4,5 cd $Here } demo8() { cd $Tmp weather.nominal > data.arff nb10 data.arff | report 4 18,16 cd $Here } demo9() { cd $Tmp weather.nominal > data.arff j48 data.arff data.arff cd $Home } demo10() { demo9 | gotwant } demo11() { demo10 | abcd --before "\na,b,c,d,acc,pd,pf,prec,bal\n" --decimals 1 | malign } demo12() { cd $Tmp auto93discreteClass > data.arff j48 data.arff data.arff | gotwant cd $Here } demo13() { demo12 | gawk -F, '$1 != $2' } demo14() { demo12 | for goal in _0 _20 _40 ; do abcd --goal "$goal" \ --prefix "auto93d,$goal" \ --before "\n#data,goal,a,b,c,d,acc,pd,pf,prec,bal\n" \ --decimals 1 done | malign } demo15() { cd $Tmp ( echo "#data,bin, a,b,c,d,acc,pd,pf,prec,bal" seed=$RANDOM; for((bin=1;bin<=10;bin++)); do blab "$bin" auto93discreteClass | someArff --seed $seed --bins $Bins --bin $bin j48 train.arff test.arff | gotwant | abcd --goal "_20" --prefix "auto93,$bin" --decimals 1 
done | sort -t, -n -k 11,11
  ) | malign > demo15.csv #each bin
  blabln " "
  echo ""; cat demo15.csv
  cp demo15.csv $Safe/demo15.csv
  cd $Here
}

# Two repeats of five auto93 bins with J48, scored on goal _20.
demo16() {
  # Results used to be written and saved as demo15.csv, clobbering
  # demo15's output.
  local me=demo16
  cd $Tmp
  ( echo "#data,repeat,bin,a,b,c,d,acc,pd,pf,prec,bal"
    for((r=1;r<=2;r++)); do
      blab "repeat=$r "
      seed=$RANDOM;
      for((bin=1;bin<=5;bin++)); do
        blab "$bin"
        auto93discreteClass | someArff --seed $seed --bins $Bins --bin $bin
        j48 train.arff test.arff |
          gotwant |
          abcd --goal "_20" --prefix "auto93,$r,$bin" --decimals 1
      done
      blabln
    done | sort -t, -n -k 12,12
  ) | malign > $me.csv #each bin
  blabln " "
  echo ""; cat $me.csv
  cp $me.csv $Safe/$me.csv
  cd $Here
}

# Data sets x repeats x bins x learners x goals; the abcd lines are
# saved to $Safe/demo17.csv.
demo17() {
  local me=demo17
  local bins=10
  local repeats=2
  local learners="oner nb j48"
  local datas="diabetes autos"
  cd $Tmp
  (echo "#data,repeat,bin,learner,goal,a,b,c,d,acc,pd,pf,prec,bal"
   for data in $datas; do
     arff=$Ourmine/lib/arffs/uci/discrete/$data.arff
     for((r=1;r<=$repeats;r++)); do
       blab "data=$data repeat=$r "
       seed=$RANDOM;
       for((bin=1;bin<=$bins;bin++)); do
         blab "$bin"
         cat $arff | someArff --seed $seed --bins $bins --bin $bin
         goals=$(cat $arff | classes --brief)
         for learner in $learners; do
           $learner train.arff test.arff | gotwant > results.dat
           for goal in $goals; do
             cat results.dat |
               abcd --goal "$goal" \
                    --prefix "$data,$r,$bin,$learner,$goal" \
                    --decimals 1
           done
         done
       done
       blabln
     done
   done
  ) | sort -t, -n -k 14,14 | malign > $me.csv #each bin
  blabln " "
  echo ""; cat $me.csv
  cp $me.csv $Safe/$me.csv
  cd $Here
}

# Per data set and learner, the "##" summary line medians prints for
# demo17's results (demo17 is run first if they are missing).
demo18() {
  local stats="$Safe/demo17.csv"
  local learners="nb j48 oner"
  local datas="diabetes autos"
  [ ! -f "$stats" ] && demo17
  (echo "#data,learner,goal,a,b,c,d,acc,pd,pf,prec,bal"
   for data in $datas; do
     for learner in $learners; do
       echo -n "$data,$learner, "
       grep $data $stats | grep $learner | medians --start 6 | grep "##"
     done
   done
  ) | malign | sort -t, -n -k 12,12
}

demo19() {
  local stats="$Safe/demo17.csv"
  [ !
-f "$stats" ] && demo17 winLossTie --input $stats --fields 14 --perform 14 --key 4 --95 --high } demoSample() { local me=demoSample local bins=10 local repeats=10 local learners="nb j48" local datas="cm1 kc1" cd $Tmp (echo "#data,repeat,bin,sample,learner,goal,a,b,c,d,acc,pd,pf,prec,bal" for data in $datas; do arff=$Ourmine/lib/arffs/mdp/$data.arff cat $arff | logNumbers > logged.arff discretizeViaFayyadIrani logged.arff > loggedDiscrete.arff for((r=1;r<=$repeats;r++)); do blab "data=$data repeat=$r " seed=$RANDOM; for((bin=1;bin<=$bins;bin++)); do blab "$bin" cat loggedDiscrete.arff | someArff --seed $seed --bins $bins --bin $bin for s in over under raw; do sample train.arff true $s goals="true" for learner in $learners; do $learner sampled.arff test.arff | gotwant > results.dat for goal in $goals; do cat results.dat | abcd --goal "$goal" \ --prefix "$data,$r,$bin,$s,$learner,$goal" \ --decimals 1 done done done done blabln done done) | sort -t, -n -k 15,15 | malign > $me.csv #each bin blabln " " echo ""; cat $me.csv cp $me.csv $Safe/$me.csv cd $Here } addAliensData(){ gawk -v F=$1 -v Seed=$RANDOM -v Replace=$2 -v AlsoReplace=$3 -v Fill=$4 ' BEGIN { lines=0; srand(Seed); while(getline x < F) { if((x !~ /^[ \t]*$/)&&(x !~ /@attribute/)&&(x !~ /@data/)&&(x!="")&&(x !~ /@relation/)&&(x !~ /^%/)) { filelines[lines]=x lines++; } else if(x ~ /@attribute/) { attcount++; print x } else print x } close(F); for(j=0;j<=lines;j++) { if(j<(lines/2)) print filelines[j]; else{ if(j%2==0) { outline="" split(filelines[j],thisline,","); for(k=1;k=0; i--) { x=int(rand()*1000); while(x>(lines-5)) { x = x/2; } if(file1lines[x] !="") { print file1lines[x]; } else i++; } } ' } demoCombine() { combineFiles $Ourmine/lib/arffs/uci/discrete/soybean.arff $Ourmine/lib/arffs/uci/discrete/soybean.arff > $Safe/soybean/combined.arff } combineFiles() { gawk -v F=$1 -v G=$2 -v Seed=$RANDOM ' BEGIN { lines=0; lines2=0; srand(Seed); while(getline x < F) { if((x !~ /^[ \t]*$/)&&(x !~ /@attribute/)&&(x 
!~ /@data/)&&(x!="")&&(x !~ /@relation/)&&(x !~ /^%/)) { file1lines[lines]=x lines++; } else print x } close(F); while(getline y < G) { if((y !~ /^[ \t]*$/)&&(y !~ /@attribute/)&&(y !~ /@data/)&&(y!="")&&(y !~ /@relation/)&&(y !~ /^%/)) { file2lines[lines2]=x lines2++; } } close(G); for(i=lines; i>=0; i--) { x=int(rand()*1000); while(x>lines) { x = x/2; } if(file1lines[x] !="") { print file1lines[x]; } else i++; if((lines2>0)&&(x y ? x : y } function min(x,y) { return x < y ? x : y } function sd(sumSq,sumX,n) { return sqrt((sumSq-((sumX*sumX)/n))/(n-1)); }' }
# NOTE(review): the line above is the tail of a function whose start is not
# visible here and whose text was partly lost (see "(x y ? x : y"); it is kept
# byte-for-byte and must be restored from version control.

##########################################################################
# stats FILE
# Reads the ARFF file FILE and prints one "Uniques in column K:" header per
# @attribute line found.  While reading it counts, per column, the distinct
# values (ucount) and the values starting with "?" (qcount).
# NOTE(review): the counts are computed but never printed; the original
# looked unfinished, so the output is left exactly as it was.
stats() {
  gawk -v F="$1" '
  function isData(s) {
    return (s !~ /^[ \t]*$/) && (s !~ /@attribute/) && (s !~ /@data/) &&
           (s !~ /@relation/) && (s !~ /^%/)
  }
  BEGIN {
    acount = 0
    lcount = 0
    # "> 0" matters: getline returns -1 on an unreadable file, which a bare
    # while(getline ...) would treat as true forever.
    while ((getline x < F) > 0) {
      if (x ~ /@attribute/) {
        split(x, line, " ")
        if ((line[3] ~ /numeric/) || (line[3] ~ /real/)) {
          atts[acount] = "n"
        } else {
          atts[acount] = "d"
        }
        acount++
      } else if (isData(x)) {
        split(x, line, ",")
        lcount++
        for (i = 1; i <= acount; i++) {   ## All attributes
          if (line[i] ~ /^\?/) {
            qcount[i]++
          } else if (!((i, line[i]) in uniques)) {
            uniques[i, line[i]] = 1
            ucount[i]++
          }
        }
      }
    }
    close(F)
    for (k = 1; k <= acount; k++) {
      print "Uniques in column "k":"
    }
  }
  '
}

##########################################################################
# demoCohen
# For each data set, repeat and bin: splits the arff with someArff, runs each
# learner on train/test, scores every class with abcd, then sorts the rows on
# column 14 into $Tmp/cohenResults.csv, copies that to $Safe and runs
# winLossTie over it.
# Globals read: Tmp, Ourmine, Safe, Here.
demoCohen() {
  local datas="iris sonar weather segment splice audiology vote mushroom labor"
  ##anneal autos breast-cancer colic diabetes glass"
  local me="cohenResults"
  local bins=10
  local repeats="10"
  local learners="j48 jrip part"
  cd "$Tmp" || return 1
  (
    echo "#data,repeat,bin,learner,goal,a,b,c,d,acc,pd,pf,prec,bal"
    for data in $datas; do
      arff="$Ourmine/lib/arffs/uci/discrete/$data.arff"
      for ((r = 1; r <= repeats; r++)); do
        blab "data=$data repeat=$r "
        seed=$RANDOM
        for ((bin = 1; bin <= bins; bin++)); do
          blab "$bin"
          someArff --seed "$seed" --bins "$bins" --bin "$bin" \
            --train train.arff --test test.arff < "$arff"
          # was "goals==...", which glued a "=" onto the first class name
          goals=$(classes --brief < "$arff")
          for learner in $learners; do
            $learner train.arff test.arff | gotwant > results.dat
            for goal in $goals; do
              tee myfile.dat < results.dat |
                abcd --goal "$goal" \
                  --prefix "$data,$r,$bin,$learner,$goal" \
                  --decimals 1
            done
          done
        done
        blabln
      done
    done
  ) | sort -t, -n -k 14,14 | malign > "$me.csv"
  #each bin
  blabln " "
  echo ""
  cat "$me.csv"
  cp "$me.csv" "$Safe/$me.csv"
  winLossTie --input "$Safe/$me.csv" --fields 14 --perform 14 --key 4 --95 --high
  cd "$Here"
}

##########################################################################
# holteWorker
# For each listed data set: filters the arff through processA into
# $Tmp/p<data>.arff and stores the demoHolte output in $Safe/holte<data>.
holteWorker() {
  local data
  local datas="credit-g primary-tumor"
  ##"anneal audiology autos credit-a credit-g diabetes heart-h ionosphere letter primary-tumor segment sonar splice vehicle vowel waveform-5000"
  ##"v1"
  ##"iris soybean"
  ##"hepatitis labor lymph"
  ##"glass g2"
  #####"breast-cancer kr-vs-kp heart-c colic hypothyroid mushroom sick vote"
  for data in $datas; do
    ##processVotes $Ourmine/lib/arffs/uci/discrete/$data.arff > $Tmp/v$data.arff
    processA "$Ourmine/lib/arffs/uci/discrete/$data.arff" > "$Tmp/p$data.arff"
    demoHolte "$Tmp/p$data.arff" "$data" > "$Safe/holte$data"
  done
}

# processA FILE : runs $Here/lib/processArffs.awk with F set to FILE.
processA() {
  gawk -f "$Here/lib/processArffs.awk" -v F="$1"
}

##########################################################################
# processVotes FILE
# Copies the arff FILE to stdout, dropping the 4th comma-separated field of
# every data row; header, comment and blank lines pass through unchanged.
processVotes() {
  gawk -v F="$1" '
  function isData(s) {
    return (s !~ /^[ \t]*$/) && (s !~ /@attribute/) && (s !~ /@data/) &&
           (s !~ /@relation/) && (s !~ /^%/)
  }
  BEGIN {
    while ((getline x < F) > 0) {
      if (isData(x)) {
        count = split(x, line, ",")
        out = line[1]
        for (i = 2; i <= count; i++) {
          if (i != 4) out = out "," line[i]
        }
        print out
      } else {
        print x
      }
    }
    close(F)
  }
  '
}

##########################################################################
# processGlass FILE SET
# SET == "glass": drops the first field of every data row.
# otherwise     : keeps only data rows mentioning buildwindfloat,
#                 buildwindnon-float or vehicwindfloat, rewriting the last
#                 field of vehicwindfloat rows to "buildwindfloat".
# Non-data lines are passed through in both modes.
processGlass() {
  gawk -v F="$1" -v S="$2" '
  function isData(s) {
    return (s !~ /^[ \t]*$/) && (s !~ /@attribute/) && (s !~ /@data/) &&
           (s !~ /@relation/) && (s !~ /^%/)
  }
  BEGIN {
    if (S == "glass") {
      while ((getline x < F) > 0) {
        if (isData(x)) {
          count = split(x, line, ",")
          out = line[2]
          for (i = 3; i <= count; i++) {
            out = out "," line[i]
          }
          print out
        } else {
          print x
        }
      }
      close(F)
    } else {
      while ((getline x < F) > 0) {
        if (isData(x)) {
          count = split(x, line, ",")
          if ((x ~ /buildwindfloat/) || (x ~ /buildwindnon-float/) || (x ~ /vehicwindfloat/)) {
            if (x ~ /vehicwindfloat/) {
              line[count] = "buildwindfloat"
            }
            out = line[1]
            for (i = 2; i <= count; i++) {
              out = out "," line[i]
            }
            print out
          } else {
            # discard; the target must be a quoted string to reach /dev/null
            print x > "/dev/null"
          }
        } else {
          print x
        }
      }
      close(F)
    }
  }
  '
}

##########################################################################
# demoHolte ARFF NAME
# For each learner: 25 repeats of a 3-bin split (bin 1 only) of ARFF, scored
# per class with abcd, sorted on column 10 into $Tmp/holteResults.csv; the
# medians are printed and the csv is copied to $Safe/holteResults<NAME>.csv.
# NOTE(review): $bin is never set here (the bin loop is commented out), so the
# "bin" column of the prefix is empty.
demoHolte() {
  local me="holteResults"
  local bins=3
  local repeats="25"
  local learners="oner j48"
  # the data name used in messages/prefixes; falls back to the caller value
  local data="${2:-$data}"
  cd "$Tmp" || return 1
  for learner in $learners; do
    (
      echo "#data,repeat,bin,learner,goal,a,b,c,d,acc,pd,pf,prec,bal"
      arff=$1
      for ((r = 1; r <= repeats; r++)); do
        blab "data=$data repeat=$r "
        seed=$RANDOM
        ##for((bin=1;bin<=$bins;bin++)); do
        ## blab "$bin"
        someArff --seed "$seed" --bins "$bins" --bin 1 \
          --train train.arff --test test.arff < "$arff"
        goals=$(classes --brief < "$arff")
        ###for learner in $learners; do
        $learner train.arff test.arff | gotwant > results.dat
        for goal in $goals; do
          abcd --goal "$goal" \
            --prefix "$data,$r,$bin,$learner,$goal" \
            --decimals 1 < results.dat
        done
        ## done
        ## done
        blabln
      done
    ) | sort -t, -n -k 10,10 | malign > "$me.csv"
    #each bin
    blabln " "
    echo ""
    medians --start 2 < "$me.csv" | malign
    cp "$me.csv" "$Safe/$me$2.csv"
  done
  # moved out of the loop: returning early made the second learner run in $Here
  cd "$Here"
}

##########################################################################
# demoEffort
# Effort experiment over the mdp data sets: tags rows with ids, does a 10x10
# nested split, classifies each inner split with Weka NaiveBayes, accumulates
# the marks with sumInstances, then for thresholds 1..9 labels the sums and
# reports abcde statistics.  Everything runs in a subshell with cwd = $Tmp.
# NOTE(review): prepForSums is called here but its definition is damaged in
# this file (see the sumInstances line further down).
demoEffort() {
  local learners="nb"
  local datas="cm1"
  cd "$Tmp" || return 1
  (
    for data in $datas; do
      arff="$Ourmine/lib/arffs/mdp/$data.arff"
      addUniqueID "$arff" > "$Tmp/dataWithIDs.arff"
      prepForSums "$Tmp/dataWithIDs.arff" > "$Tmp/dataForSums.arff"
      seed=$RANDOM
      for ((b = 1; b <= 10; b += 1)); do
        someArff --train "$Tmp/rest$b.arff" --test "$Tmp/test$b.arff" \
          --bin "$b" --bins 10 --seed "$seed" < "$Tmp/dataWithIDs.arff"
        for ((c = 1; c <= 10; c += 1)); do
          if ((c != b)); then
            seed=$RANDOM
            someArff --train "$Tmp/train$b$c.arff" --test "$Tmp/ignored.arff" \
              --bin "$c" --bins 10 --seed "$seed" < "$Tmp/rest$b.arff"
            blabln $b","$c
            # $Weka is a command line held in a string: leave it unquoted
            $Weka weka.classifiers.bayes.NaiveBayes -p 1 \
              -t "$Tmp/train$b$c.arff" -T "$Tmp/test$b.arff" > "$Tmp/dataToBeMarked$b$c"
            markInstances "$Tmp/dataToBeMarked$b$c" "$Tmp/test$b.arff" > "markedData$b$c.arff"
          fi
        done
        for ((d = 1; d <= 10; d += 1)); do
          current=$((d - 1))
          if ((b == 1 && d == 1)); then
            echo " "
          elif (((b == 1 && d == 2) || d == 1)); then
            # first usable inner bin: start from the all-zero sums
            sumInstances "$Tmp/markedData$b$d.arff" "$Tmp/dataForSums.arff" \
              > "$Tmp/dataForSums$b$d.arff"
          elif ((d != b)); then
            if ((d == b + 1)); then
              # bin d-1 == b was skipped, so chain from the one before it
              current=$((current - 1))
            fi
            sumInstances "$Tmp/markedData$b$d.arff" "$Tmp/dataForSums$b$current.arff" \
              > "$Tmp/dataForSums$b$d.arff"
          fi
        done
        for ((threshold = 1; threshold <= 9; threshold += 1)); do
          t=10
          if ((b != t)); then
            sums="$Tmp/dataForSums$b$t.arff"
          else
            sums="$Tmp/dataForSums109.arff"
          fi
          labelInstances "$sums" "$threshold" > "labeledData$b$threshold.arff"
          echo "a,b,c,d,effort,acc,pd,pf,prec,bal" | malign
          effortGotWant "labeledData$b$threshold.arff" |
            abcde --goal "defective" --input "labeledData$b$threshold.arff" |
            malign
        done
      done
    done
  )
}

##########################################################################
# abcde [options] < got,want pairs
#   -d|--decimals N   decimals in the report        (default 2)
#   -b|--before  TXT  text printed before the report
#   -p|--prefix  TXT  text prepended to the report line
#   -g|--goal    RE   regex that marks the target class (default "true|yes")
#   -i|--input   ARFF file the pairs were derived from
# Reads "predicted,actual" lines on stdin, builds the a/b/c/d confusion
# counts, then re-reads ARFF to sum the lOCode column per outcome and prints
#   a,b,c,d,effort,acc,pd,pf,prec,bal   (percentages).
# Returns 1 on an unknown option.
abcde() {
  local goal="true|yes"
  local before=""
  local prefix=""
  local decimals=2
  local input=""
  while [[ "$1" == *-* ]]; do
    case $1 in
      -d|--decimals) decimals=$2 ;;
      -b|--before)   before=$2 ;;
      -p|--prefix)   prefix=$2 ;;
      -g|--goal)     goal=$2 ;;
      -i|--input)    input=$2 ;;
      *) blabln "'"$1"' unknown\n usage abcd [options]"; return 1 ;;
    esac
    # a flag without a value would otherwise never be consumed
    shift 2 || return 1
  done
  [ -n "$before" ] && printf "$before"
  gawk -v F="$input" '
  function isData(s) {
    return (s !~ /^[ \t]*$/) && (s !~ /@attribute/) && (s !~ /@data/) &&
           (s !~ /@relation/) && (s !~ /^%/)
  }
  BEGIN {
    Decimals = 3
    Got = 1
    Want = 2
    Prefix = ""
    True = "true"   ## define symbol 1
    A = B = C = D = Effort = 0
    FS = OFS = ","
    GoalPd = 1
    GoalPf = 0
    Contents[0] = 0
  }
  function yes(s) { return s ~ True }
  function no(s)  { return ( yes(s) ? 0 : 1 ) }
  { sub(/#.*/,"") }
  /^[ \t]*$/ { next }
  NF==2 {
    N++
    Predicted = $Got
    Actual = $Want
    if (Predicted == Actual) Good++
    if (no( Actual) && no( Predicted)) { A++; Contents[N]="a"; }
    if (yes(Actual) && no( Predicted)) { B++; Contents[N]="b"; }
    if (no( Actual) && yes(Predicted)) { C++; Contents[N]="c"; }
    if (yes(Actual) && yes(Predicted)) { D++; Contents[N]="d"; }
    #print N,$0,A,B,C,D
  }
  END {
    locodea = locodeb = locodec = locoded = 0
    while ((getline x < F) > 0) {
      if (x ~ /@attribute/) {
        attCount++
        split(x, att, " ")
        if (att[2] == "lOCode") loc = attCount
      } else if (isData(x)) {
        count++
        split(x, words, ",")
        if (Contents[count] == "a") locodea += words[loc]
        else if (Contents[count] == "b") locodeb += words[loc]
        else if (Contents[count] == "c") locodec += words[loc]
        else if (Contents[count] == "d") locoded += words[loc]
      }
    }
    close(F)
    # guarded like the other ratios: a zero total used to abort gawk
    locodeAll = locodea + locodeb + locodec + locoded
    if (locodeAll > 0) Effort = (locodec + locoded) / locodeAll
    OFMT = "%." Decimals "f"
    Balance = Precision = Accuracy = Pf = NotPf = Pd = 0
    if (C+D > 0 ) Precision = D/(C+D)
    if ((A+B+C+D) > 0) Accuracy = (A+D)/(A+B+C+D)
    if (A+C > 0 ) Pf = C/(A+C)
    if (B+D > 0 ) Pd = D/(B+D)
    if (B+C+D > 0) {   # special case- everything misses
      Balance = 1 - sqrt((GoalPd - Pd)^2 + (GoalPf - Pf)^2)/sqrt(2)
    }
    if(Prefix) printf Txt=Prefix OFS;
    print A,B,C,D,
          sprintf(OFMT,100*Effort),
          sprintf(OFMT,100*Accuracy),
          sprintf(OFMT,100*Pd),
          sprintf(OFMT,100*Pf),
          sprintf(OFMT,100*Precision),
          sprintf(OFMT,100*Balance)
  }' Prefix="$prefix" Decimals="$decimals" True="$goal" -
}

##########################################################################
# effortGotWant FILE
# For every data row of FILE prints "<field 1>,<actual>", where <actual> is
# "defective" when the last field is "true" and "notDefective" otherwise.
effortGotWant() {
  gawk -v F="$1" '
  function isData(s) {
    return (s !~ /^[ \t]*$/) && (s !~ /@attribute/) && (s !~ /@data/) &&
           (s !~ /@relation/) && (s !~ /^%/)
  }
  BEGIN {
    while ((getline x < F) > 0) {
      if (isData(x)) {
        count = split(x, words, ",")
        if (words[count] == "true") {
          actual = "defective"
        } else {
          actual = "notDefective"
        }
        print words[1] "," actual
      }
    }
    close(F)
  }
  '
}

##########################################################################
# labelInstances FILE THRESHOLD
# Copies FILE, inserting a "defective" attribute after @relation (the line
# that follows @relation is dropped) and prefixing every data row with
# "defective" when its first field is >= THRESHOLD, else "notDefective".
labelInstances() {
  gawk -v F="$1" -v T="$2" '
  function isData(s) {
    return (s !~ /^[ \t]*$/) && (s !~ /@attribute/) && (s !~ /@data/) &&
           (s !~ /@relation/) && (s !~ /^%/)
  }
  BEGIN {
    while ((getline x < F) > 0) {
      if (x ~ /@relation/) {
        print x
        print ""
        print "@attribute defective {defective,notDefective}"
        skip = 1
      } else if (skip == 1) {
        print x > "/dev/null"
        skip = 0
      } else if (isData(x)) {
        count = split(x, line, ",")
        if (line[1] >= T) {
          out = "defective"
        } else {
          out = "notDefective"
        }
        for (i = 1; i <= count; i++) {
          out = out "," line[i]
        }
        print out
      } else {
        print x
      }
    }
    close(F)
  }
  '
}

# NOTE(review): the next line is damaged.  The text between "for(i=2;i" and
# "/dev/null" is missing, which took the end of sumInstances and the header of
# prepForSums with it.  It is kept byte-for-byte; restore both functions from
# version control rather than guessing at the lost code.
sumInstances() { gawk -v F=$1 -v O=$2 -v Debug=$Tmp/debug ' BEGIN { count=1; while(getline x < F) { if((x !~ /^[ \t]*$/)&&(x !~ /@attribute/)&&(x !~ /@data/)&&(x!="")&&(x !~ /@relation/)&&(x !~ /^%/)) { split(x,datas,","); instances[datas[2]]=datas[1]; } } close(F); for(d in datas) count++; while(getline y < O) { if((y !~ /^[ \t]*$/)&&(y !~ /@attribute/)&&(y !~ /@data/)&&(y!="")&&(y !~ /@relation/)&&(y !~ /^%/)) { split(y,infos,","); line=0; line=infos[1]+instances[infos[2]]; out=0; out=line for(i=2;i /dev/null; skip=0; } else if((x !~ /^[ \t]*$/)&&(x !~ /@attribute/)&&(x !~ /@data/)&&(x!="")&&(x !~ /@relation/)&&(x !~ /^%/)) { print 0","x; } else print x } close(F); } ' }

##########################################################################
# markInstances PREDICTIONS ARFF
# Reads PREDICTIONS (space-separated): word 5 with its first and last
# character removed is used as a key, mapped to 1 when word 4 is "true" and
# to 0 otherwise.  Then copies ARFF, inserting "@attribute defective {0,1}"
# after @relation (dropping the following line) and prefixing each data row
# with the mark stored under the row's first field.
markInstances() {
  gawk -v F="$1" -v T="$2" '
  function isData(s) {
    return (s !~ /^[ \t]*$/) && (s !~ /@attribute/) && (s !~ /@data/) &&
           (s !~ /@relation/) && (s !~ /^%/)
  }
  BEGIN {
    count = 1
    while ((getline x < F) > 0) {
      split(x, words, " ")
      id = substr(words[5], 2, length(words[5]) - 2)
      if (words[4] == "true") {
        instances[id] = 1
      } else {
        instances[id] = 0
      }
    }
    close(F)
    while ((getline y < T) > 0) {
      split(y, line, ",")
      if (y ~ /@relation/) {
        print y
        print ""
        print "@attribute defective {0,1}"
        skip = 1
      } else if (skip == 1) {
        print y > "/dev/null"
        skip = 0
      } else if (isData(y)) {
        print instances[line[1]] "," y
      } else {
        print y
      }
    }
    close(T)
  }
  '
}

##########################################################################
# addUniqueID FILE
# Copies the arff FILE without its "%" comment lines, inserting
# "@attribute UniqueID numeric" after @relation (dropping the following line)
# and prefixing each data row with a running number starting at 1.
addUniqueID() {
  gawk -v F="$1" '
  function isData(s) {
    return (s !~ /^[ \t]*$/) && (s !~ /@attribute/) && (s !~ /@data/) &&
           (s !~ /@relation/) && (s !~ /^%/)
  }
  BEGIN {
    OFS = FS = ","
    count = 1
    while ((getline x < F) > 0) {
      if (x ~ /^%/) {
        print x > "/dev/null"
      } else if (x ~ /@relation/) {
        print x
        print ""
        print "@attribute UniqueID numeric"
        skip = 1
      } else if (skip == 1) {
        print x > "/dev/null"
        skip = 0
      } else if (isData(x)) {
        print count "," x
        count++
      } else {
        print x
      }
    }
    close(F)
  }
  '
}

### stop reading. broken after this/

##########################################################################
# cat file | flip [options]
#   -d|--data COLS         comma-separated columns that name the data set
#   -k|--key  COLS         comma-separated columns that name the treatment
#   -p|--performance COL   column holding the score
# Prints a "#data,<key>,max?,..." header built from the keys seen on stdin.
# NOTE(review): the "exit" in END stops before the per-data rows are printed;
# it was in the original (this section is marked broken) and is left alone.
flip() {
  local data
  local key
  local performance
  while [[ "$1" == *-* ]]; do
    case $1 in
      -d|--data)        data="$2" ;;
      -k|--key)         key="$2" ;;
      # was $3, which is out of step with the "shift 2" below
      -p|--performance) performance=$2 ;;
      *) blabln "'"$1"' unknown\n usage cat file | flip [options]"
         return 1 ;;
    esac
    shift 2 || return 1
  done
  gawk '
  BEGIN { FS = OFS = "," }
  NR==1 {
    nData = split(DataStr, TheData, /,/)
    nKeys = split(KeyStr, TheKeys, /,/)
  }
  /^[ \t]*#[^[#]/ { next }
  {
    key = data = ""
    # look up the requested columns (the loop index itself used to be used)
    for (d = 1; d <= nData; d++) data = data "." $(TheData[d])
    for (k = 1; k <= nKeys; k++) key = key "." $(TheKeys[k])
    Result[key,data] = $Performance
    if ($Performance > Max[data] ) { Max[data] = $Performance }
    Keys[key] = key
    Datas[data] = data
  }
  END {
    printf "#data"
    for (K in Keys) printf "," K ",max?"
    print ""
    exit
    for (D in Datas) {
      printf D
      for (K in Keys) {
        printf "," Result[K,D]
        printf (Result[K,D]== Max[D]) ? ",X" : ","
      }
      print ""
    }
  }
  ' DataStr="$data" KeyStr="$key" Performance="$performance" - #| medians | malign
}

##########################################################################
# summary
# Runs demo17 if $Safe/demo17.csv is missing, pipes demo18 through flip, then
# prints winLossTie reports for all rows and for the diabetes and autos rows.
summary() {
  cd "$Tmp" || return 1
  local stats="$Safe/demo17.csv"
  [ ! -f "$stats" ] && demo17
  demo18 | flip --data 1 --key 2 --performance 12
  printf "\n---| all |------\n\n"
  winLossTie --input "$stats" --fields 14 --perform 14 --key 4 --95 --high
  for d in diabetes autos; do
    printf "\n---| $d |------\n\n"
    grep "$d" "$stats" > "$d.stats"
    winLossTie --input "$d.stats" --fields 14 --perform 14 --key 4 --95 --high
  done
}

##########################################################################
# demo101
# For every data/learner/prep combination, selects the matching rows of
# $HOME/tmp/safe/demo2.log, sorts them on column 14 and passes them to
# medians; the result is written to $Safe/demo101.log and printed.
demo101() {
  local me=demo101
  local stats="$HOME/tmp/safe/demo2.log"
  local learners="aode j48 jrip nb oner"
  local preps="loggedDiscrete discrete"
  local datas="cm1 kc1 kc2 kc3_mod mc1_mod mc2_mod mw1_mod pc1 pc2_mod pc3_mod pc4_mod pc5_mod"
  (
    echo "#data,prep,attrs,bin,learner,a,b,c,d,acc,pd,pf,prec,bal"
    for data in $datas; do
      for learner in $learners; do
        for prep in $preps; do
          grep -v '#' "$stats" |
            grep "$data" |
            grep "$prep" |
            grep "$learner" |
            sort -t, -n -k 14,14 |
            medians --start 6
        done
      done
    done
  ) > "$Safe/$me.log"
  cat "$Safe/$me.log"
}

##########################################################################
# demo102
# Runs demo101 if its log is missing, then flips the "##" rows of the log
# using columns 2,5 as the key and column 14 as the score.
demo102() {
  cd "$Tmp" || return 1
  local stats="$Safe/demo101.log"
  [ ! -f "$stats" ] && demo101
  grep "##" "$stats" | flip --data 1 --key 2,5 --performance 14
}

# oner nb j48
#auto 56.6 60.4 85.9*
#diabetes 57.2 68.5 69.3*
#
#demo10() {
# demo9 | gawk -F, '/@/ {next}
# NF>1 {print $NF}' | sort | uniq -c
#}
#demo11() {
# setup; cd $Tmp
# demo9 > data.arff
#
# c=0.1
# printf "confidence limit for pruning = $c (very selective)\n\n"
# j4810c data.arff $c | report 0 3,18,16
#
# c=0.25
# printf "confidence limit for pruning = $c (default, less selective)\n\n"
# j4810c data.arff $c | report 0 3,18,16
# cd $Here
#}
#demo1001() {
# setUpVars
# setUpDirs
# setUpSeds
# prep
# cd $Tmp
# pwd
# makeshare
# worker1001 > log
# cp log $Safe/demo1.log
# winLossTie log | tee $Safe/demo1.winLossTie
#}
#demo1002() {
# setUpVars
# setUpDirs
# setUpSeds
# prep
# cd $Tmp
# pwd
# makeshare
# Learners="j48 jrip oner nb aode"
# worker1002 > log
# cp log $Safe/demo1.log
# winLossTie log | tee $Safe/demo1.winLossTie
#}

#### start up
setup
. worker.sh
blabln "OurMine version v2.0 (c)2007-2008 tim@menzies.us/gregoryg@csee.wvu.edu under GPLv3"
blabln "Too many doings, not enough learnings.\n"