# warning: requires at least 5MB of free disk
##########################################################################
# /* vim: set filetype=sh : */
# ourmine : a simple learning environment for data mining
# Copyright (C) 2007, Tim Menzies, tim@menzies.us, http://menzies.us
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
##########################################################################

# for more info on command-line weka stuff, see
# http://www.cs.waikato.ac.nz/~remco/weka_bn/node13.html

# Directory this file was loaded from; the demos "cd $Here" when they finish.
Here=`pwd`

#### generic stuff

# reload: go back to $Here and re-source $Ourrc.
# ($Ourrc is not set in this file; it must come from the caller's environment.)
reload() {
  cd $Here
  . $Ourrc
}

# show NAME: if "set" lists a variable whose name starts with NAME, print
# the matching lines; otherwise print the definition of function NAME.
show() {
  local goal1="^$1"
  local com="/^$1 /,/^}/{print}"
  if (set | grep $goal1 | grep "=" > /tmp/debug); then
    set | grep $goal1
  else
    set | gawk "$com"
  fi
}

# blab / blabln: progress messages on stderr (without / with a newline).
# The arguments are used as the printf format, so "\n" etc. are honoured.
blab() { printf "$*" >&2; }
blabln() { printf "$*\n" >&2; }

#### initialization stuff

# setup: initialise the global variables, then the working directories.
setup() {
  setUpVars
  setUpDirs
}

lcsee() {
  alias ls="ls --color"
}

# build: put $Here/minerc.lib at the front of gawk's library path.
build() {
  export AWKPATH="$Here/minerc.lib:$AWKPATH"
}

# setUpVars: define prompt, search paths and the default experiment
# parameters (Bins, Repeats, Learners, Data) used by the workers.
setUpVars() {
  alias ls="ls -G"
  PROMPT_COMMAND='echo -ne "\033]0;${HOSTNAME}: `pwd`\007"'
  PS1="Our MINE! \W#\!> "
  Ourmine="$HOME/opt/ourmine"
  PATH="$Ourmine/bin:$HOME/bin:$PATH"
  Safe=$Ourmine/var/safe
  # NOTE(review): $Our is not set anywhere in this file (only $Ourmine is);
  # confirm it is provided by the environment, else Dirs is "/lib/arffs".
  Dirs="$Our/lib/arffs"
  LibUrl="http://unbox.org/wisp/trunk/our/minerc.lib/lib.zip"
  export AWKPATH="$Ourmine/lib:$AWKPATH"
  Bins=10
  Repeats=2;
  Learners="nb nbk"
  Data="$Ourmine/lib/arff/uci/discrete/a*.arff $Ourmine/lib/arff/uci/discrete/s*.arff"
  Audit="pgawk --profile=$HOME/tmp/awkprof.out --dump-variables=$HOME/tmp/awkvars.out --lint"
}

# setUpDirs: create the directories used below, make a fresh scratch
# directory $Tmp, fetch the support library on first use and copy
# weka.jar into $Tmp.
setUpDirs() {
  mkdir -p $HOME/tmp
  mkdir -p /tmp/$USER
  Tmp=`mktemp -d -p /tmp/$USER`
  Weka="nice -19 java -Xmx1024M -cp $Tmp/weka.jar "
  mkdir -p $Tmp
  mkdir -p $Ourmine/lib # for support code
  mkdir -p $Ourmine/bin # for our executables
  mkdir -p $HOME/bin # for your executables
  mkdir -p $Safe # for stuff you want to keep around
  if [ ! -f "$Ourmine/lib/lib.zip" ]; then
    downloads
  fi
  # I had too much trouble with pathname syntax problems
  # on mac, windows, linux, etc. So now I just copy weka.jar
  # to the working directory (no need for pathnames)
  cp $Ourmine/lib/weka.jar $Tmp
}

# downloads: fetch $LibUrl into $Ourmine/lib and unpack it there.
# Runs in a subshell so the caller's working directory is untouched.
downloads() {
  set -x
  (
    # do not download/unzip into whatever directory we happen to be in
    cd $Ourmine/lib || exit 1
    wget -O lib.zip $LibUrl
    unzip -o lib.zip
  )
  set +x
}

#### stuff for the turkey experiment

# setUpSeds: write the sed script that maps attribute-name variants onto
# one common vocabulary; prep applies it.
setUpSeds() {
  mkdir -p $Tmp/etc   # nothing else creates this directory
  cat<<-EOF > $Tmp/etc/seds
s/loccodeandcomment/loc_code_and_comment/
s/locodeandcomment/loc_code_and_comment/
s/locandcomment/loc_code_and_comment/
s/essential_complexity/ev(g)/
s/cyclomatic_complexity/v(g)/
s/halstead_length/n/
s/halstead_level/l/
s/num_operators/n1/
s/num_operands/n2/
s/unique_operands/uniq_opnd/
s/unique_operators/uniq_op/
s/halstead_content/i/
s/halstead_error_est/b/
s/halstead_prog_time/t/
s/halstead_effort/e/
s/halstead_difficulty/d/
s/halstead_volume/v/
s/loc_comments/loc_comment/
s/design_complexity/iv(g)/
s/locomment/loc_comment/
s/loc_total/loc/
s/locode/loc/
s/[\t ]c[\t ]/ defects /
s/[\t ]problems[\t ]/ defects /
s/branchcout/branch_count/
s/total_op[\t ]/n1 /
s/total_opnd/n2/
s/{no,yes}/{false,true}/
EOF
}

# prep: lower-case every $Dirs/mdp/*.arff, apply the sed script written by
# setUpSeds and store the result under $Tmp/arff.
prep() {
  mkdir -p $Tmp/arff   # nothing else creates this directory
  for i in $Dirs/mdp/*.arff ; do
    cat $i | tr A-Z a-z | sed -f $Tmp/etc/seds \
      > $Tmp/arff/`basename $i`
  done
}

# classes [-b|--brief]: read an arff file on stdin and print, for every
# value of the last column of the data section, "count,value"
# (or just "value" with --brief).
classes() {
  local brief=0
  while [ `echo $1 | grep "-"` ]; do
    case $1 in
      -b|--brief) brief=1;;
      *) blabln "'"$1"' unknown\n usage cat file | classes [options]"
         return 1;;
    esac
    shift 1
  done
  gawk '
    BEGIN { OFS=FS=","
            IGNORECASE=1
            Brief=0 }
    { gsub(/#.*/,"") }
    /^[ \t]*$/ { next }
    Data && NF > 1 { Freq[$NF]++ }
    /@data/ { Data=1 }
    END { for(N in Freq)
            if (Brief) { print N } else { print Freq[N],N }}
  ' Brief=$brief -
}

# intersectAttributes FILE...: list the attribute names that appear in
# every one of the given arff files.
intersectAttributes() {
  # All files are handed to gawk ("$@"); passing only $1 made Files==1,
  # so every attribute of the first file counted as shared.
  gawk '
    BEGIN { IGNORECASE=1; OFS=","}
    FNR==1 { Files++ }
    /@attribute/ { Got[$2]++ }
    END { for(A in Got) if (Got[A]>=Files) print A }' "$@"
}

# shared: attributes common to all $Tmp/arff/*.arff, sorted, with
# "defects" forced to be the last one.
shared() {
  for i in `intersectAttributes $Tmp/arff/*.arff | sort | grep -v defects`; do
    echo $i
  done
  echo defects
}

# some ATTRS FILE: generate an arff file that only contains certain
# attributes (the work is done by some.awk, found via AWKPATH).
some() {
  gawk -f some.awk -v Some="$1" $2
}

# makeshare: project every $Tmp/arff/*.arff onto the shared attributes and
# store the result under $Tmp/shared.
makeshare() {
  Shared=`shared`
  mkdir -p $Tmp/shared   # nothing else creates this directory
  for i in $Tmp/arff/*.arff; do
    echo $i
    some "$Shared" $i > $Tmp/shared/`basename $i`
  done
}

# report INDENT SHOW: read blank-line separated paragraphs on stdin and
# print the paragraphs whose numbers are listed (comma separated) in SHOW,
# each line indented by INDENT spaces.
report() {
  gawk 'BEGIN {RS=""; FS="\n"}
    NR==1 { M=split(Show,Shows,",") }
    { R[++N]=indent($0) }
    END {print " "; for(r=1;r<=M;r++) printf("\n%s",R[Shows[r]]); print ""; }
    function str(n,chr, out) {
      chr = chr ?
chr : " ";
      while(n-- > 0) out= out chr;
      return out
    }
    # The first parameter used to be called "str", the same name as the
    # function above; POSIX awk does not allow a function name to be
    # reused as a parameter name. The parameter itself is never read.
    function indent(line, i, out) {
      for(i=1;i<=NF;i++)
        out=out str(Indent," ") $i "\n"
      return out
    }
  ' Show=$2 Indent=$1 -
}

#### end inter intra stuff

#### misc utils

# makeTrainTest SEED BINS BIN [FILE]: split an arff file into train.arff
# and test.arff via someArff. The data is read from FILE when given
# (the worker functions pass one) and from stdin otherwise.
makeTrainTest() {
  cat "${4:--}" | someArff --seed $1 --bins $2 --bin $3
}

# gotwant: read whitespace-separated lines on stdin and print pairs:
# fields 2,3 of 3-field lines and fields 2,4 of 4-field lines.
gotwant() {
  gawk '
    BEGIN { UnLog = 0; # was spelled "Unlog", a different (unused) variable
            OFS = ","
            Ee = 848456353 / 312129649; }
    NF == 3 { if (UnLog) { print Ee^$2 , Ee^$3 } else { print $2,$3 } }
    NF == 4 { print $2 , $4 }
  ' -
}

# abcd [options]: read two-column comma separated lines on stdin and
# print a,b,c,d,accuracy,pd,pf,precision,balance for one goal class.
# A value counts as the goal when it matches the --goal regular expression.
#   -d, --decimals N   decimals in the percentages    (default 2)
#   -b, --before   S   printf format printed first    (default none)
#   -p, --prefix   S   text put in front of the row   (default none)
#   -g, --goal     RE  regular expression of the goal (default true|yes)
abcd() {
  local goal="true|yes"
  local before=""
  local prefix=""
  local decimals=2
  while [ `echo $1 | grep "-"` ]; do
    case $1 in
      -d|--decimals) decimals=$2;;
      -b|--before) before=$2;;
      -p|--prefix) prefix=$2;;
      -g|--goal) goal=$2;;
      *) blabln "'"$1"' unknown\n usage abcd [options]"; return 1;;
    esac
    shift 2
  done
  # quoted so that a header containing blanks stays one format string
  [ -n "$before" ] && printf "$before"
  gawk '
    BEGIN { Decimals = 3
            Got = 1
            Want = 2;
            Prefix = "";
            True = "true"; ## define symbol 1
            A=B=C=D=0 ;
            FS=OFS=","
            GoalPd = 1;
            GoalPf = 0; }
    function yes(s) {return s ~ True }
    function no(s) {return ( yes(s) ? 0 : 1 ) }
    { sub(/#.*/,"") }
    /^[ \t]*$/ { next }
    NF==2 { N++; Predicted=$Got; Actual=$Want;
            if (Predicted == Actual) Good++;
            if (no( Actual) && no( Predicted)) A++;
            if (yes(Actual) && no( Predicted)) B++;
            if (no( Actual) && yes(Predicted)) C++;
            if (yes(Actual) && yes(Predicted)) D++;
            #print N,$0,A,B,C,D
          }
    END { OFMT = "%." Decimals "f";
          Balance=Precision=Accuracy=Pf=NotPf=Pd=0;
          if (C+D > 0 ) Precision = D/(C+D);
          if ((A+B+C+D) > 0) Accuracy = (A+D)/(A+B+C+D);
          if (A+C > 0 ) Pf = C/(A+C)
          if (B+D > 0 ) Pd = D/(B+D);
          if (B+C+D > 0) { # special case- everything misses
            Balance = 1 - sqrt((GoalPd - Pd)^2 + (GoalPf - Pf)^2)/sqrt(2) }
          # the prefix is data, not a format: a "%" in it must print as is
          if(Prefix) printf "%s", Prefix OFS;
          print A,B,C,D,
                sprintf(OFMT,100*Accuracy),
                sprintf(OFMT,100*Pd),
                sprintf(OFMT,100*Pf),
                sprintf(OFMT,100*Precision),
                sprintf(OFMT,100*Balance);
        }' Prefix="$prefix" Decimals="$decimals" True="$goal" -
}

# quartile2tex: turn comma separated statistics rows on stdin into LaTeX
# table rows; 5-field rows go through statsprint(), 7-field rows through
# qprint(), lines containing "===" start a new group.
quartile2tex() {
  cat - | gawk 'BEGIN { FS=","
                        OFS="&"}
    /===/ {print $1; last=0 ; rank=1; next}
    NF==0 { print ""; next}
    /#/ {next}
    Last!=NF { print (NF==7 ? "\\scriptsize\\begin{tabular}{rrrr} rank & treatment & median & distribution\\\\" : "stats") ; Last=NF}
    NF==5 { print statsprint() }
    NF==7 { print qprint() }
    function statsprint( sep,i,out) {
      if ($4 != last) { rank++ }
      last=$4
      out = rank " & "
      for(i=1;i<=NF;i++) {
        out = out sep $i
        sep="&" }
      return out "\\\\"
    }
    function qprint( sep,i,out) {
      out = "1 & " $1 "&" $4 "&"
      out=out "\\boxplot{"$2"}{"$3"}{"$4"}{"$5 - $3"}{"$6"}"
      return out "\\\\"
    }
    function trim(s) { gsub(/[ \t]/,"",s); return s }
  '
  # gawk 'BEGIN {FS="\n"; RS=""}
  # { M=split($0,Lines,/\\t/)
  # Z=0;
  # for(I=1;I<=M;I++) {
  # N=split(Lines[I],Words,/\\t/)
  # print ++Z " " trim(Lines[I])
  # #for(J=1;J<=N; J++)
  # # print Z " :: " J " :: " trim(Words[J])
  # }
  # }
  # function trim(s) {
  # sub(/^[ \t\n]*/,"",s);
  # sub(/[ \t]\n*$/,"",s);
  # return s
  # }'
}

# malign: right-align the columns of the comma separated table on stdin.
# Lines with a single field are printed unchanged.
malign() {
  cat - | gawk '
    BEGIN { Width=1;
            Gutter=1;
            OFS=FS=","; }
    { N++;
      for(I=1;I<=NF;I++) {
        if( (L=length($I)) > Max[I]) Max[I]=L;
        ++Data[N,0];
        Data[N,I]=$I; } }
    END {for(J=1;J<=N;J++) {
           Str=Sep1="";
           if (Data[J,0]>1) {
             for(I=1;I<=NF;I++) {
               L=length(Data[J,I]);
               Str = Str Sep1 \
                     str(most(Width,Max[I]+Gutter+1)-L," ") \
                     Data[J,I];
               Sep1= OFS; }}
           else {Str=Data[J,1]}
           print Str;} }
    function str(n,c, out) { while(--n > 0) out = out c; return out; }
    function most(x,y) { return x > y ?
x : y; } ' } medians() { local start="2" while [ `echo $1 | grep "-"` ]; do case $1 in -s|--start) start=$2;; *) blabln "'"$1"' unknown\n usage medians [options]"; return 1;; esac shift 2 done gawk ' BEGIN{FS=","} {print} /^[ \t]*$/ {next} /#/ {next} {for(I=Start;I<=NF;I++) { (Data[I,0]++); Data[I,Data[I,0]]=$I } } END{ print "" printf("##"); printf $1 for(I=2;I 2 { for(I=1;I<=Attr;I++) if (I in Num) if ($I !~ /\?/) { if ( ($I +0) < Min) {Bad=1} else {Bad=0} if (Bad) $I= Min; $I=log($I) } print $0 } ' - } winLossTie() { local fields=10 local key=1 local performance=$fields local high=1 local confidence=95 local input="-" while [ `echo $1 | grep "-"` ]; do case $1 in -f|--fields) fields=$2; shift 2;; --99) confidence=99; shift 1;; --95) confidence=95; shift 1;; -k|--key) key=$2; shift 2;; -p|--perform) performance=$2; shift 2;; --high) high=1; shift 1;; --low) high=0; shift 1;; -i|--input) input=$2; shift 2;; *) blabln "'"$1"' unknown\n. usage: winLossTie [options]" return 1;; esac done (echo "#key,ties,win,loss,win-loss @ ${confidence}%" gawk -f mwu.awk Fields=$fields Key=$key Performance=$performance \ High=$high Confidence=$confidence $input | sort -t, -r -n -k 5,5 ) | malign } someArff() { local bins=3 local bin=1 local seed=$RANDOM while [ `echo $1 | grep "-"` ]; do case $1 in -B|--bins) bins=$2;; -b|--bin) bin=$2;; -s|--seed) seed=$2;; -h|--help) cat <<-EOF someArff : divide an arrf file into Bins, create train/test files usage: someArff [flags] arffFile Flags -B, --bins NUM Randomly divide the data into NUM bins -b, --bin NUM Store bin NUM into test.arff and rest into train.arff -s, --seed NUM Set the random number seed to NUM -h, --help Print this text EOF return 1;; *) blabln "'"$1"' unknown\n usage cat file | someArff [options]" return 1;; esac shift 2 done gawk ' BEGIN { IGNORECASE=1; Trainf="train.arff"; Testf="test.arff"; Bins=3; Bin=1; Seed=1; } { sub(/#.*/,"") } { sub(/\%.*/,"") } /^[ \t]*$/ { next } /@relation/ { Seed ? 
srand(Seed) : srand(1) }
    /@relation/ { printf "">Trainf; printf "">Testf }
    /@relation/,/@data/ { print $0 >> Trainf; print $0 >> Testf; next }
    { Line[rand()] = $0; Lines++ }
    END { Start = Lines/Bins * (Bin - 1) ;
          Stop = Lines/Bins * Bin;
          for(I in Line) {
            N++;
            # N counts from 1, so bin number Bin is the half-open range
            # (Start,Stop]. The old test "N>=Start && N<Stop" never put the
            # last row into any test set and left bin 1 one row short.
            What = (N > Start && N <= Stop) ? Testf : Trainf
            print Line[I]>>What; } }
  ' Seed=$seed Bins=$bins Bin=$bin -
}

#### Weka stuff
# Every wrapper below runs the command line stored in $Weka (set by
# setUpDirs) and announces itself with one character on stderr.

## pruning columns

# removeAttributes FIRST LAST FILE: print FILE without attributes FIRST-LAST.
removeAttributes() {
  blab "/"
  $Weka weka.filters.unsupervised.attribute.Remove \
    -R "${1}-${2}" -i $3 -o tmp.arff
  # (a stray "set +x" used to sit here; it silently switched off the
  #  caller's shell tracing)
  cat tmp.arff
}

## discretization

# discretizeViaFayyadIrani FILE: print FILE after supervised discretization.
discretizeViaFayyadIrani() {
  blab "x"
  $Weka weka.filters.supervised.attribute.Discretize \
    -c last -R first-last -i $1 -o tmp.arff
  cat tmp.arff
}

## feature subset selection

# rankViaInfoGain FILE: print FILE with attributes ranked by info gain.
rankViaInfoGain() {
  blab "<"
  $Weka weka.filters.supervised.attribute.AttributeSelection \
    -S "weka.attributeSelection.Ranker -T -2.7976931348623157E308 -N -1" \
    -E "weka.attributeSelection.InfoGainAttributeEval" \
    -i $1 -o tmp.arff
  cat tmp.arff
}

### learners
# Learners called as "NAME train.arff test.arff" pass -t $1 -T $2 to weka;
# the "...10" variants take a single file.

## classifiers
# rule-based classifiers
oner() {
  blab "1"
  $Weka weka.classifiers.rules.OneR \
    -B 6 \
    -p 0 -t $1 -T $2
}

jrip() {
  blab "j"
  $Weka weka.classifiers.rules.JRip \
    -F 3 -N 2.0 -O 2 -S 1 \
    -p 0 -t $1 -T $2
}

jrip10() {
  blab "j"
  $Weka weka.classifiers.rules.JRip \
    -F 3 -N 2.0 -O 2 -S 1 \
    -t $1
}

# bayesian classifiers
aode() {
  blab "a"
  $Weka weka.classifiers.bayes.AODE \
    "-F" 0 \
    -p 0 -t $1 -T $2
}

# NOTE(review): unlike the other "...10" variants this one still passes -T $2.
aode10() {
  blab "a"
  $Weka weka.classifiers.bayes.AODE \
    "-F" 0 \
    -t $1 -T $2
}

# nbd TRAIN TEST: naive bayes implemented in nbd.awk (found via AWKPATH).
nbd() {
  blab "N"
  gawk -f nbd.awk Brief=1 Pass=1 $1 Pass=2 $2 |
  gawk 'BEGIN {FS=","} {print "-",$1,"-",$2}'
}

# nba TRAIN TEST: nba.awk reads TRAIN twice (passes 1 and 2), then TEST.
nba() {
  blab "A"
  gawk -f nba.awk Debug=1 Pass=1 $1 Pass=2 $1 Pass=3 $2
}

nb() {
  blab "n"
  $Weka weka.classifiers.bayes.NaiveBayes \
    -p 0 -t $1 -T $2
}

nb10() {
  blab "n"
  $Weka weka.classifiers.bayes.NaiveBayes \
    -i -t $1
}

nbk() {
  blab "k"
  $Weka weka.classifiers.bayes.NaiveBayes \
    -K \
    -p 0 -t $1 -T $2
}

# decision tree learners
j48() {
  blab "c"
  $Weka weka.classifiers.trees.J48 \
    -C 0.25 -M 2 \
    -p 0 -t $1 -T $2
}

j4810() {
  blab "c"
  $Weka weka.classifiers.trees.J48 \
    -C 0.25 -M 2 \
    -i -t $1
}

# j4810c FILE C: like j4810 but with -C taken from the second argument.
j4810c() {
  blab "c$2"
  $Weka weka.classifiers.trees.J48 \
    -C $2 -M 2 \
    -i -t $1
}

## linear-model learners
lsr() {
  blab "L"
  $Weka weka.classifiers.functions.LinearRegression \
    -S 0 -R 1.0E-8 \
    -p 0 -t $1 -T $2
}

m5p() {
  blab "P"
  $Weka weka.classifiers.trees.M5P \
    -p 0 -t $1 -T $2
}

## nearest neighbor
1Bkx() {
  blab "N"
  $Weka weka.classifiers.lazy.IBk \
    -K 1 -W 0 -X -E \
    -p 0 -t $1 -T $2
}

1Bk() {
  blab "n"
  $Weka weka.classifiers.lazy.IBk \
    -K -1 -W 0 -E \
    -p 0 -t $1 -T $2
}

## association rule learners
apriori() {
  blab "A"
  # was "$Weke": an unset variable, so the class name itself was run as a command
  $Weka weka.associations.Apriori \
    -N 10 -T 0 -C 0.9 -D 0.05 -U 1.0 -M 0.1 -S -1.0 \
    -p 0 -t $1 -T $2
}

#### teaching demos

# weather.nominal: print the 14-row weather data set as an arff file.
weather.nominal() {
  cat<<-EOF
@relation weather.nominal
@attribute outlook {sunny, overcast, rainy}
@attribute temperature {hot, mild, cool}
@attribute humidity {high, normal}
@attribute windy {TRUE, FALSE}
@attribute play {yes, no}
@data
sunny,hot,high,FALSE,no
sunny,hot,high,TRUE,no
overcast,hot,high,FALSE,yes
rainy,mild,high,FALSE,yes
rainy,cool,normal,FALSE,yes
rainy,cool,normal,TRUE,no
overcast,cool,normal,TRUE,yes
sunny,mild,high,FALSE,no
sunny,cool,normal,FALSE,yes
rainy,mild,normal,FALSE,yes
sunny,mild,normal,TRUE,yes
overcast,mild,high,TRUE,yes
overcast,hot,normal,FALSE,yes
rainy,mild,high,TRUE,no
EOF
}

# auto93: print the auto93 data set (numeric class) as an arff file.
auto93() {
  cat<<-EOF
@relation 'auto93.names'
@attribute Manufacturer { Acura, Audi, BMW, Buick, Cadillac, Chevrolet, Chrysler, Dodge, Eagle, Ford, Geo, Honda, Hyundai, Infiniti, Lexus, Lincoln, Mazda, Mercedes-Benz, Mercury, Mitsubishi, Nissan, Oldsmobile, Plymouth, Pontiac, Saab, Saturn, Subaru, Suzuki, Toyota, Volkswagen, Volvo}
@attribute Type { Small, Midsize, Compact, Large, Sporty, Van}
@attribute City_MPG real
@attribute Highway_MPG real
@attribute Air_Bags_standard { 0, 2, 1}
@attribute Drive_train_type { 1, 0, 2}
@attribute Number_of_cylinders real
@attribute Engine_size real
@attribute Horsepower real
@attribute RPM real
@attribute Engine_revolutions_per_mile real @attribute Manual_transmission_available { 1, 0} @attribute Fuel_tank_capacity real @attribute Passenger_capacity real @attribute Length real @attribute Wheelbase real @attribute Width real @attribute U-turn_space real @attribute Rear_seat_room real @attribute Luggage_capacity real @attribute Weight real @attribute Domestic { 0, 1} @attribute class real @data Acura,Small,25,31,0,1,4,1.8,140,6300,2890,1,13.2,5,177,102,68,37,26.5,11,2705,0,15.9 Acura,Midsize,18,25,2,1,6,3.2,200,5500,2335,1,18,5,195,115,71,38,30,15,3560,0,33.9 Audi,Compact,20,26,1,1,6,2.8,172,5500,2280,1,16.9,5,180,102,67,37,28,14,3375,0,29.1 Audi,Midsize,19,26,2,1,6,2.8,172,5500,2535,1,21.1,6,193,106,70,37,31,17,3405,0,37.7 BMW,Midsize,22,30,1,0,4,3.5,208,5700,2545,1,21.1,4,186,109,69,39,27,13,3640,0,30 Buick,Midsize,22,31,1,1,4,2.2,110,5200,2565,0,16.4,6,189,105,69,41,28,16,2880,1,15.7 Buick,Large,19,28,1,1,6,3.8,170,4800,1570,0,18,6,200,111,74,42,30.5,17,3470,1,20.8 Buick,Large,16,25,1,0,6,5.7,180,4000,1320,0,23,6,216,116,78,45,30.5,21,4105,1,23.7 Buick,Midsize,19,27,1,1,6,3.8,170,4800,1690,0,18.8,5,198,108,73,41,26.5,14,3495,1,26.3 Cadillac,Large,16,25,1,1,8,4.9,200,4100,1510,0,18,6,206,114,73,43,35,18,3620,1,34.7 Cadillac,Midsize,16,25,2,1,8,4.6,295,6000,1985,0,20,5,204,111,74,44,31,14,3935,1,40.1 Chevrolet,Compact,25,36,0,1,4,2.2,110,5200,2380,1,15.2,5,182,101,66,38,25,13,2490,1,13.4 Chevrolet,Compact,25,34,1,1,4,2.2,110,5200,2665,1,15.6,5,184,103,68,39,26,14,2785,1,11.4 Chevrolet,Sporty,19,28,2,0,6,3.4,160,4600,1805,1,15.5,4,193,101,74,43,25,13,3240,1,15.1 Chevrolet,Midsize,21,29,0,1,4,2.2,110,5200,2595,0,16.5,6,198,108,71,40,28.5,16,3195,1,15.9 Chevrolet,Van,18,23,0,1,6,3.8,170,4800,1690,0,20,7,178,110,74,44,30.5,?,3715,1,16.3 Chevrolet,Van,15,20,0,2,6,4.3,165,4000,1790,0,27,8,194,111,78,42,33.5,?,4025,1,16.6 Chevrolet,Large,17,26,1,0,8,5,170,4200,1350,0,23,6,214,116,77,42,29.5,20,3910,1,18.8 
Chevrolet,Sporty,17,25,1,0,8,5.7,300,5000,1450,1,20,2,179,96,74,43,?,?,3380,1,38 Chrysler,Large,20,28,2,1,6,3.3,153,5300,1990,0,18,6,203,113,74,40,31,15,3515,1,18.4 Chrysler,Compact,23,28,2,1,4,3,141,5000,2090,0,16,6,183,104,68,41,30.5,14,3085,1,15.8 Chrysler,Large,20,26,1,1,6,3.3,147,4800,1785,0,16,6,203,110,69,44,36,17,3570,1,29.5 Dodge,Small,29,33,0,1,4,1.5,92,6000,3285,1,13.2,5,174,98,66,32,26.5,11,2270,1,9.2 Dodge,Small,23,29,1,1,4,2.2,93,4800,2595,1,14,5,172,97,67,38,26.5,13,2670,1,11.3 Dodge,Compact,22,27,1,1,4,2.5,100,4800,2535,1,16,6,181,104,68,39,30.5,14,2970,1,13.3 Dodge,Van,17,21,1,2,6,3,142,5000,1970,0,20,7,175,112,72,42,26.5,?,3705,1,19 Dodge,Midsize,21,27,1,1,4,2.5,100,4800,2465,0,16,6,192,105,69,42,30.5,16,3080,1,15.6 Dodge,Sporty,18,24,1,2,6,3,300,6000,2120,1,19.8,4,180,97,72,40,20,11,3805,1,25.8 Eagle,Small,29,33,0,1,4,1.5,92,6000,2505,1,13.2,5,174,98,66,36,26.5,11,2295,1,12.2 Eagle,Large,20,28,2,1,6,3.5,214,5800,1980,0,18,6,202,113,74,40,30,15,3490,1,19.3 Ford,Small,31,33,0,1,4,1.3,63,5000,3150,1,10,4,141,90,63,33,26,12,1845,1,7.4 Ford,Small,23,30,0,1,4,1.8,127,6500,2410,1,13.2,5,171,98,67,36,28,12,2530,1,10.1 Ford,Compact,22,27,0,1,4,2.3,96,4200,2805,1,15.9,5,177,100,68,39,27.5,13,2690,1,11.3 Ford,Sporty,22,29,1,0,4,2.3,105,4600,2285,1,15.4,4,180,101,68,40,24,12,2850,1,15.9 Ford,Sporty,24,30,1,1,4,2,115,5500,2340,1,15.5,4,179,103,70,38,23,18,2710,1,14 Ford,Van,15,20,1,2,6,3,145,4800,2080,1,21,7,176,119,72,45,30,?,3735,1,19.9 Ford,Midsize,21,30,1,1,6,3,140,4800,1885,0,16,5,192,106,71,40,27.5,18,3325,1,20.2 Ford,Large,18,26,1,0,8,4.6,190,4200,1415,0,20,6,212,114,78,43,30,21,3950,1,20.9 Geo,Small,46,50,0,1,3,1,55,5700,3755,1,10.6,4,151,93,63,34,27.5,10,1695,0,8.4 Geo,Sporty,30,36,1,1,4,1.6,90,5400,3250,1,12.4,4,164,97,67,37,24.5,11,2475,0,12.5 Honda,Sporty,24,31,2,1,4,2.3,160,5800,2855,1,15.9,4,175,100,70,39,23.5,8,2865,0,19.8 Honda,Small,42,46,1,1,4,1.5,102,5900,2650,1,11.9,4,173,103,67,36,28,12,2350,0,12.1 
Honda,Compact,24,31,2,1,4,2.2,140,5600,2610,1,17,4,185,107,67,41,28,14,3040,0,17.5 Hyundai,Small,29,33,0,1,4,1.5,81,5500,2710,1,11.9,5,168,94,63,35,26,11,2345,0,8 Hyundai,Small,22,29,0,1,4,1.8,124,6000,2745,1,13.7,5,172,98,66,36,28,12,2620,0,10 Hyundai,Sporty,26,34,0,1,4,1.5,92,5550,2540,1,11.9,4,166,94,64,34,23.5,9,2285,0,10 Hyundai,Midsize,20,27,0,1,4,2,128,6000,2335,1,17.2,5,184,104,69,41,31,14,2885,0,13.9 Infiniti,Midsize,17,22,1,0,8,4.5,278,6000,1955,0,22.5,5,200,113,72,42,29,15,4000,0,47.9 Lexus,Midsize,18,24,1,1,6,3,185,5200,2325,1,18.5,5,188,103,70,40,27.5,14,3510,0,28 Lexus,Midsize,18,23,2,0,6,3,225,6000,2510,1,20.6,4,191,106,71,39,25,9,3515,0,35.2 Lincoln,Midsize,17,26,2,1,6,3.8,160,4400,1835,0,18.4,6,205,109,73,42,30,19,3695,1,34.3 Lincoln,Large,18,26,2,0,8,4.6,210,4600,1840,0,20,6,219,117,77,45,31.5,22,4055,1,36.1 Mazda,Small,29,37,0,1,4,1.6,82,5000,2370,1,13.2,4,164,97,66,34,27,16,2325,0,8.3 Mazda,Small,28,36,0,1,4,1.8,103,5500,2220,1,14.5,5,172,98,66,36,26.5,13,2440,0,11.6 Mazda,Compact,26,34,1,1,4,2.5,164,5600,2505,1,15.5,5,184,103,69,40,29.5,14,2970,0,16.5 Mazda,Van,18,24,0,2,6,3,155,5000,2240,0,19.6,7,190,110,72,39,27.5,?,3735,0,19.1 Mazda,Sporty,17,25,1,0,?,1.3,255,6500,2325,1,20,2,169,96,69,37,?,?,2895,0,32.5 Mercedes-Benz,Compact,20,29,1,0,4,2.3,130,5100,2425,1,14.5,5,175,105,67,34,26,12,2920,0,31.9 Mercedes-Benz,Midsize,19,25,2,0,6,3.2,217,5500,2220,0,18.5,5,187,110,69,37,27,15,3525,0,61.9 Mercury,Sporty,23,26,1,1,4,1.6,100,5750,2475,1,11.1,4,166,95,65,36,19,6,2450,1,14.1 Mercury,Midsize,19,26,0,0,6,3.8,140,3800,1730,0,18,5,199,113,73,38,28,15,3610,1,14.9 Mitsubishi,Small,29,33,0,1,4,1.5,92,6000,2505,1,13.2,5,172,98,67,36,26,11,2295,0,10.3 Mitsubishi,Midsize,18,24,1,1,6,3,202,6000,2210,0,19,5,190,107,70,43,27.5,14,3730,0,26.1 Nissan,Small,29,33,1,1,4,1.6,110,6000,2435,1,13.2,5,170,96,66,33,26,12,2545,0,11.8 Nissan,Compact,24,30,1,1,4,2.4,150,5600,2130,1,15.9,5,181,103,67,40,28.5,14,3050,0,15.7 
Nissan,Van,17,23,0,1,6,3,151,4800,2065,0,20,7,190,112,74,41,27,?,4100,0,19.1 Nissan,Midsize,21,26,1,1,6,3,160,5200,2045,0,18.5,5,188,104,69,41,28.5,14,3200,0,21.5 Oldsmobile,Compact,24,31,0,1,4,2.3,155,6000,2380,0,15.2,5,188,103,67,39,28,14,2910,1,13.5 Oldsmobile,Midsize,23,31,1,1,4,2.2,110,5200,2565,0,16.5,5,190,105,70,42,28,16,2890,1,16.3 Oldsmobile,Van,18,23,0,1,6,3.8,170,4800,1690,0,20,7,194,110,74,44,30.5,?,3715,1,19.5 Oldsmobile,Large,19,28,1,1,6,3.8,170,4800,1570,0,18,6,201,111,74,42,31.5,17,3470,1,20.7 Plymouth,Sporty,23,30,0,2,4,1.8,92,5000,2360,1,15.9,4,173,97,67,39,24.5,8,2640,1,14.4 Pontiac,Small,31,41,0,1,4,1.6,74,5600,3130,1,13.2,4,177,99,66,35,25.5,17,2350,1,9 Pontiac,Compact,23,31,0,1,4,2,110,5200,2665,1,15.2,5,181,101,66,39,25,13,2575,1,11.1 Pontiac,Sporty,19,28,2,0,6,3.4,160,4600,1805,1,15.5,4,196,101,75,43,25,13,3240,1,17.7 Pontiac,Midsize,19,27,0,1,6,3.4,200,5000,1890,1,16.5,5,195,108,72,41,28.5,16,3450,1,18.5 Pontiac,Large,19,28,2,1,6,3.8,170,4800,1565,0,18,6,177,111,74,43,30.5,18,3495,1,24.4 Saab,Compact,20,26,1,1,4,2.1,140,6000,2910,1,18,5,184,99,67,37,26.5,14,2775,0,28.7 Saturn,Small,28,38,1,1,4,1.9,85,5000,2145,1,12.8,5,176,102,68,40,26.5,12,2495,1,11.1 Subaru,Small,33,37,0,2,3,1.2,73,5600,2875,1,9.2,4,146,90,60,32,23.5,10,2045,0,8.4 Subaru,Small,25,30,0,2,4,1.8,90,5200,3375,1,15.9,5,175,97,65,35,27.5,15,2490,0,10.9 Subaru,Compact,23,30,1,2,4,2.2,130,5600,2330,1,15.9,5,179,102,67,37,27,14,3085,0,19.5 Suzuki,Small,39,43,0,1,3,1.3,70,6000,3360,1,10.6,4,161,93,63,34,27.5,10,1965,0,8.6 Toyota,Small,32,37,1,1,4,1.5,82,5200,3505,1,11.9,5,162,94,65,36,24,11,2055,0,9.8 Toyota,Sporty,25,32,1,1,4,2.2,135,5400,2405,1,15.9,4,174,99,69,39,23,13,2950,0,18.4 Toyota,Midsize,22,29,1,1,4,2.2,130,5400,2340,1,18.5,5,188,103,70,38,28.5,15,3030,0,18.2 Toyota,Van,18,22,1,2,4,2.4,138,5000,2515,1,19.8,7,187,113,71,41,35,?,3785,0,22.7 Volkswagen,Small,25,33,0,1,4,1.8,81,5500,2550,1,12.4,4,163,93,63,34,26,10,2240,0,9.1 
Volkswagen,Van,17,21,0,1,5,2.5,109,4500,2915,1,21.1,7,187,115,72,38,34,?,3960,0,19.7 Volkswagen,Compact,21,30,0,1,4,2,134,5800,2685,1,18.5,5,180,103,67,35,31.5,14,2985,0,20 Volkswagen,Sporty,18,25,0,1,6,2.8,178,5800,2385,1,18.5,4,159,97,66,36,26,15,2810,0,23.3 Volvo,Compact,21,28,1,0,4,2.3,114,5400,2215,1,15.8,5,190,104,67,37,29.5,14,2985,0,22.7 Volvo,Midsize,20,28,2,1,5,2.4,168,6200,2310,1,19.3,5,184,105,69,38,30,15,3245,0,26.7 EOF } auto93discreteClass() { #some learners can't handle auto93's numeric class #so we discretize the class. Note that this is a pretty # dumb discretizer. auto93 | gawk 'BEGIN {IGNORECASE=1; OFS=","; Round=20} In && NF > 1 {$NF= "_"int($NF/Round+0.5)*Round} $2 =="class" {$3 = "{_0,_20,_40,_60}"} /@data/ {In=1; FS=","} { print}' } #### some workers worker1001() { local learners1="j48 oner aode nb nbk jrip" local learners2="aode" local repeats=10; local bins=10; local datas="cm1 kc1 kc2 kc3_mod mc1_mod mc2_mod mw1_mod pc1 pc2_mod pc3_mod pc4_mod pc5_mod" for one in $data; do cp $one raw.arff stem=`basename $one` stem=${stem/.*/} logNumbers raw.arff > logged.arff discretizeViaFayyadIrani raw.arff > discrete.arff discretizeViaFayyadIrani logged.arff > loggedDiscrete.arff for x in raw discrete logged loggedDiscrete; do rankViaInfoGain $x.arff > ranked.arff for attrs in 4 7 13 16; do removeAttributes $attrs 16 $x.arff > ranked${attrs}.arff blab "$stem $x $attrs " echo "#file,x,attrs,bin,learner,a,b,c,d,acc,pd,pf,prec,g" for((r=1;r<=$repeats;r++)); do seed=$RANDOM for((bin=1; bin <= $bins ; bin++)); do blab "$bin" makeTrainTest $seed $bins $bin ranked${attrs}.arff for learner in $learners1; do $learner train.arff test.arff | gotwant | abcd "$stem,$x,$attrs,$Bin,$Learner" done if [ $x != "raw" ]; then $learner train.arff test.arff | gotwant | abcd "$stem,$x,$attrs,$Bin,$Learner" fi done done | medians blabln done done done | tee $Safe/worker1001.log } worker1002() { for one in $Data; do cp $one raw.arff stem=`basename $one` stem=${stem/.*/} 
logNumbers raw.arff > logged.arff discretizeViaFayyadIrani raw.arff > discrete.arff discretizeViaFayyadIrani logged.arff > loggedDiscrete.arff for x in discrete loggedDiscrete; do rankViaInfoGain $x.arff > ranked.arff for Attrs in 4 7 13 16; do removeAttributes $Attrs 16 $x.arff > ranked${Attrs}.arff blab "$stem $x $Attrs " Seed=$RANDOM echo "#file,x,attrs,bin,learner,a,b,c,d,acc,pd,pf,prec,g" for((R=1;R<=$Repeats;R++)); do for((Bin=1; Bin <= $Bins ; Bin++)); do blab "." makeTrainTest $Seed $Bins $Bin ranked${Attrs}.arff for Learner in $Learners; do $Learner train.arff test.arff | gotwant | abcd "$stem,$x,$Attrs,$Bin,$Learner" done done done | medians blabln done done done } #### some demos demo3() { cd $Tmp weather.nominal > data.arff j4810 data.arff cd $Here } demo4() { cd $Tmp weather.nominal > data.arff j4810 data.arff | report 4 3 cd $Here } demo5() { cd $Tmp weather.nominal > data.arff j4810 data.arff | report 4 3,18,16 cd $Here } demo5a() { j4810 $Ourmine/lib/arffs/uci/discrete/soybean.arff } demo6() { cd $Tmp weather.nominal > data.arff nb10 data.arff cd $Here } demo7() { cd $Tmp weather.nominal > data.arff nb10 data.arff | report 4 2,3,4,5 cd $Here } demo8() { cd $Tmp weather.nominal > data.arff nb10 data.arff | report 4 18,16 cd $Here } demo9() { cd $Tmp weather.nominal > data.arff j48 data.arff data.arff cd $Home } demo10() { demo9 | gotwant } demo11() { demo10 | abcd --before "\na,b,c,d,acc,pd,pf,prec,bal\n" --decimals 1 | malign } demo12() { cd $Tmp auto93discreteClass > data.arff j48 data.arff data.arff | gotwant cd $Here } demo13() { demo12 | gawk -F, '$1 != $2' } demo14() { demo12 | for goal in _0 _20 _40 ; do abcd --goal "$goal" \ --prefix "auto93d,$goal" \ --before "\n#data,goal,a,b,c,d,acc,pd,pf,prec,bal\n" \ --decimals 1 done | malign } demo15() { cd $Tmp ( echo "#data,bin, a,b,c,d,acc,pd,pf,prec,bal" seed=$RANDOM; for((bin=1;bin<=10;bin++)); do blab "$bin" auto93discreteClass | someArff --seed $seed --bins $Bins --bin $bin j48 train.arff 
test.arff | gotwant | abcd --goal "_20" --prefix "auto93,$bin" --decimals 1 done | sort -t, -n -k 11,11 ) | malign > demo15.csv #each bin blabln " " echo ""; cat demo15.csv cp demo15.csv $Safe/demo15.csv cd $Here } demo16() { cd $Tmp ( echo "#data,repeat,bin,a,b,c,d,acc,pd,pf,prec,bal" for((r=1;r<=2;r++)); do blab "repeat=$r " seed=$RANDOM; for((bin=1;bin<=5;bin++)); do blab "$bin" auto93discreteClass | someArff --seed $seed --bins $Bins --bin $bin j48 train.arff test.arff | gotwant | abcd --goal "_20" --prefix "auto93,$r,$bin" --decimals 1 done blabln done | sort -t, -n -k 12,12 ) | malign > demo15.csv #each bin blabln " " echo ""; cat demo15.csv cp demo15.csv $Safe/demo15.csv cd $Here } demo17() { local me=demo17 local bins=10 local repeats=2 local learners="oner nb j48" local datas="diabetes autos" cd $Tmp (echo "#data,repeat,bin,learner,goal,a,b,c,d,acc,pd,pf,prec,bal" for data in $datas; do arff=$Ourmine/lib/arffs/uci/discrete/$data.arff for((r=1;r<=$repeats;r++)); do blab "data=$data repeat=$r " seed=$RANDOM; for((bin=1;bin<=$bins;bin++)); do blab "$bin" cat $arff | someArff --seed $seed --bins $bins --bin $bin goals=`cat $arff | classes --brief` for learner in $learners; do $learner train.arff test.arff | gotwant > results.dat for goal in $goals; do cat results.dat | abcd --goal "$goal" \ --prefix "$data,$r,$bin,$learner,$goal" \ --decimals 1 done done done blabln done done ) | sort -t, -n -k 14,14 | malign > $me.csv #each bin blabln " " echo ""; cat $me.csv cp $me.csv $Safe/$me.csv cd $Here } demo18() { local stats="$Safe/demo17.csv" local learners="nb j48 oner" local datas="diabetes autos" [ ! -f "$stats" ] && demo17 (echo "#data,learner,goal,a,b,c,d,acc,pd,pf,prec,bal" for data in $datas; do for learner in $learners; do echo -n "$data,$learner, " grep $data $stats | grep $learner | medians --start 6 | grep "##" done done ) | malign | sort -t, -n -k 12,12 } demo19() { local stats="$Safe/demo17.csv" [ ! 
-f "$stats" ] && demo17 winLossTie --input $stats --fields 14 --perform 14 --key 4 --95 --high } demo21() { local me=demo21 local rs=10 local bins=10 local secrets=$HOME/svns/nextgen/trunk/doc/arffs/raw local attrs="classic_metrics inter_metrics intra_metrics" local arffs=`ls $secrets/function*.arff | grep -v pair` local arffs="$secrets/function_data.arff" cd $Tmp for arff in $arffs; do cat $arff | logNumbers > logged.arff for((r=1;r<=rs;r++)); do local seed=$RANDOM for((b=1;b<=bins;b++)); do cat logged.arff | someArff --bins $bins --bin $b --seed $seed for attr in $attrs; do local want=`cat $secrets/$attr; echo defects` some "$want" train.arff > trainSome.arff some "$want" test.arff > testSome.arff nb trainSome.arff testSome.arff | gotwant | abcd --goal "true" --decimals 2 \ --prefix "`basename $arff`,$attr,$r,$b" done done done done | tee $Safe/$me.log sort -t, -n -k 13,13 $Safe/$me.log | malign } demo22() { local attrs="classic_metrics inter_metrics intra_metrics" local log=$Safe/demo21.log [ ! 
-f $log ] && demo21 echo "" (echo "#treatment,min,q1,median,q3,max," for attr in $attrs; do echo -n "$attr," grep $attr $log | cut -d, -f 13 | quartile | malign done | sort -t, -r -k 4,4 ) | malign echo "" winLossTie --input $log --fields 13 --perform 13 --key 2 --95 --high echo "" winLossTie --input $log --fields 13 --perform 13 --key 2 --99 --high } funs() { gawk 'In && NF>20 {print $1} /@data/ {In=1}' FS="," IGNORECASE=1 $1 | sort | uniq } # logNumbers miss row 1 # only work on the dynamic metrics parts() { local me=parts local secrets=$HOME/svns/nextgen/trunk/doc/arffs/raw local all="$secrets/function_data.arff" local parts="`ls $secrets/function_[0-9]*.arff`"; local inter=`echo function_name; cat $secrets/interClassic_metrics; echo defects` local inter1=`cat $secrets/interClassic_metrics; echo defects` cd $Tmp some "$inter" $all | logNumbers > all_interLogged.arff for part in `ls $secrets/function_[0-9]*.arff`; do funs $part > funs.out some "$inter" $part | logNumbers > isolated0.arff some "$inter1" isolated0.arff > isolated.arff gawk 'BEGIN { FS=","; while (getline want < "funs.out" ) Wants[want]=1; close("funs.out")} /@/ { print; next } $1 in Wants { print }' all_interLogged.arff > ensembled0.arff some "$inter1" ensembled0.arff > ensembled.arff gawk 'BEGIN { FS=","; while (getline want < "funs.out" ) Wants[want]=1; close("funs.out")} /@/ { print; next; } (! ($1 in Wants)) { print }' all_interLogged.arff > otherEnsembled0.arff some "$inter1" otherEnsembled0.arff > otherEnsembled.arff nb otherEnsembled.arff ensembled.arff | gotwant | abcd --goal "true" --prefix "`basename $part`,ensemble" nb otherEnsembled.arff isolated.arff | gotwant | abcd --goal "true" --prefix "`basename $part`,isolated" done > $Safe/$me.out cd $Here } demo23() { log=$Safe/parts.out [ ! 
-f "$log" ] && parts (echo "#data,type,a,b,c,d,acc,pd,pf,precision,balance" cat $log) | malign } demo24() { local me=demo24 local datas="weather.nominal diabetes hypothyroid" cd $Tmp for data in $datas; do arff=$Ourmine/lib/arffs/uci/discrete/$data.arff printf "\n---| `basename $arff` |-----------\n\n" j4810 $arff | report 4 3 cat $arff | gains | sort -n -t, -r -k 3,3 done } demo25() { local me=demo25 local datas="anneal audiology breast-cancer kr-vs-kp mushroom primary-tumor soybean splice vote weather.nominal" local repeats=10 local bins=10 local learners="j48 nb nbd" cd $Tmp (echo "#data,repeat,bin,learner,goal,a,b,c,d,acc,pd,pf,prec,bal" for data in $datas; do arff=$Ourmine/lib/arffs/uci/discrete/$data.arff for((r=1;r<=$repeats;r++)); do blab "data=$data repeat=$r " seed=$RANDOM; for((bin=1;bin<=$bins;bin++)); do blab "$bin" cat $arff | someArff --seed $seed --bins $bins --bin $bin goals=`cat $arff | classes --brief` for learner in $learners; do $learner train.arff test.arff | gotwant > results.dat for goal in $goals; do cat results.dat | abcd --goal "$goal" \ --prefix "$data,$r,$bin,$learner,$goal" \ --decimals 1 done done done blabln done done ) | sort -t, -n -k 14,14 | malign > $me.csv blabln " " echo ""; cat $me.csv cp $me.csv $Safe/$me.csv cd $Here } demo26() { local log=$Safe/demo25.csv [ ! 
-f $log ] && demo25 local learners="nbd nb j48" local datas="anneal audiology breast-cancer kr-vs-kp mushroom primary-tumor soybean splice vote weather.nominal" cd $Tmp winLossTie --input $log --fields 14 --perform 14 --key 4 --95 --high for data in $datas; do printf "\n---| $data |-----------------\n\n"; cat $log | grep $data > some.dat (echo "#learner,min,q1,median,q3,max," for learner in $learners; do echo -n "$learner,"; grep $learner some.dat | cut -d, -f 14 | quartile done ) | malign echo "" winLossTie --input some.dat --fields 14 --perform 14 --key 4 --95 --high done cd $Here } demo27() { cd $Tmp local me=demo27 local datas="anneal audiology breast-cancer kr-vs-kp mushroom primary-tumor soybean splice vote " for data in $datas; do arff=$Ourmine/lib/arffs/uci/discrete/$data.arff cat $arff | someArff --seed $RANDOM --bins 10 --bin 1 nba train.arff test.arff > out cat out | gawk -F, 'BEGIN {FS=OFS=","} $2 == $NF {print $0}' | cut -d, -f 1 > good cat out | gawk -F, 'BEGIN {FS=OFS=","} $2 != $NF {print $0}' | cut -d, -f 1 > bad echo "" for i in good bad; do printf "\n ---| $data $i |-----------------------\n\n" cat $i | gawk ' function str(n,chr, out) { chr = chr ? 
chr : " "; while(n-- > 0) out= out chr; return out } {n[int($0)]++; N++} END { for(i in n) { m = int(100*n[i]/N/3) if (m) print i "," m*3, "," str(m,"X") str(33-m,".") } } ' | sort -r -n | malign done done | tee $Safe/$me.csv cd $Here } demo28() { # nick's stuff cat nick/3x10all.csv | gawk -F, 'NR==1 {print $0 "," f;next} {print $0 "," 2*$11*$13/($11 + $13 + 0.0000001)}' > /tmp/f for i in a b c d e f g h i j k l m; do grep ",$i," /tmp/f | cut -d, -f 11 |gawk 'NR==1 {next} {print}' > /tmp/$i.pd; done for i in a b c d e f g h i j k l m; do grep ",$i," /tmp/f | cut -d, -f 13 |gawk 'NR==1 {next} {print}' > /tmp/$i.prec; done for i in a b c d e f g h i j k l m; do grep ",$i," /tmp/f | cut -d, -f 15 |gawk 'NR==1 {next} {print}' > /tmp/$i.f; done echo ( for i in a b c d e f g h i j k l m; do echo -n "pd,$i," ; cat /tmp/$i.pd | quartile; done winLossTie --input /tmp/f --fields 15 --key 4 --perform 11 --95 --high echo for i in a b c d e f g h i j k l m; do echo -n "prec,$i," ; cat /tmp/$i.prec | quartile; done winLossTie --input /tmp/f --fields 15 --key 4 --perform 13 --95 --high echo for i in a b c d e f g h i j k l m; do echo -n "f,$i," ; cat /tmp/$i.f | quartile; done winLossTie --input /tmp/f --fields 15 --key 4 --perform 15 --95 --high ) | tee $Safe/demo28.log } ### stop reading. broken after this/ flip() { local data local key local performance while [ `echo $1 | grep "-"` ]; do case $1 in -d|--data) data="$2";; -k|--key) key="$2";; -p|--performance) performance=$3;; *) blabln "'"$1"' unknown\n usage cat file | flip [options]" return 1;; esac shift 2 done gawk ' BEGIN {FS=OFS=","} NR==1 { split(DataStr,TheData,/,/); split(KeyStr,TheKeys,/,/); } /^[ \t]*#[^[#]/ {next} { key=data=""; for(d in TheData) data = data "." $d; for(k in TheKeys) key = key "." $k; Result[key,data]=$Performance; if ($Performance > Max[data] ) { Max[data]=$Performance} Keys[key]=key Datas[data]=data } END {printf "#data" for(K in Keys) printf "," K ",max?" 
print "" exit for(D in Datas) { printf D for(K in Keys) { printf "," Result[K,D] printf (Result[K,D]== Max[D]) ? ",X" : "," } print "" }} ' DataStr=$data KeyStr=$key Performance=$performance - #| medians | malign } summary() { cd $Tmp local stats="$Safe/demo17.csv" [ ! -f "$stats" ] && demo17 demo18 | flip --data 1 --key 2 --performance 12 printf "\n---| all |------\n\n" winLossTie --input $stats --fields 14 --perform 14 --key 4 --95 --high for d in diabetes autos; do printf "\n---| $d |------\n\n" grep $d $stats > $d.stats; winLossTie --input $d.stats --fields 14 --perform 14 --key 4 --95 --high done } demo101() { local me=demo101 local stats="$HOME/tmp/safe/demo2.log" local learners="aode j48 jrip nb oner" local preps="loggedDiscrete discrete" local datas="cm1 kc1 kc2 kc3_mod mc1_mod mc2_mod mw1_mod pc1 pc2_mod pc3_mod pc4_mod pc5_mod" (echo "#data,prep,attrs,bin,learner,a,b,c,d,acc,pd,pf,prec,bal" for data in $datas; do for learner in $learners; do for prep in $preps; do cat $stats | grep -v '#' | grep $data | grep $prep | grep $learner | sort -t, -n -k 14,14 | medians --start 6 done done done ) > $Safe/$me.log cat $Safe/$me.log } demo102() { cd $Tmp local stats="$Safe/demo101.log" [ ! 
-f "$stats" ] && demo101 cat $stats | grep "##" | flip --data 1 --key 2,5 --performance 14 } demo103() { #zach's stuff local me=demo103 local in=$HOME/wisp/var/Zach/var/whichOut/AUCwithRocky.csv cd $Tmp cat $in > ready.csv local data=`cut -d, -f 1 ready.csv | sort | uniq` local learners=`cut -d, -f 6 ready.csv | sort | uniq` for d in $data; do printf "\n\n===| $d |=======\n" for l in $learners; do echo -n "$l ," grep $l ready.csv | grep $d | cut -d, -f 7 | tee $d$l.done | quartile done | malign | sort -n -t, +3 done | tee $Safe/$me.out echo $Safe/$me.out } # manaul > anything else # standard elarners worse # which whins # demo104() { #zach's stuff cd $Tmp local me=demo104 local in1=$HOME/svns/wisp/var/Zach/var/whichOut/AUCwithMicro.csv local in2=$HOME/svns/wisp/var/Zach/var/whichOut/AUCwithTurkey.csv in=$Tmp/in cat $in1 > $in cat $in2 | sed 's/manual/manualDown/' | sed 's/launam/manualUp/' >> $in #local in=/srv/bronze/zach/wisp/var/Zach/var/whichOut/AUCfinitelist.csv #cd $Tmp cat $in | grep -v rocky | grep -v micro10 | grep -v micro5 | grep -v micro30 | grep -v micro75 | grep -v loc > ready.csv local data=`cut -d, -f 1 ready.csv | grep -v data | grep -v pc3 | sort | uniq` local learners=`cut -d, -f 6 ready.csv | grep -v learner | sort | uniq` for d in "," $data; do printf "\n\n===| $d |=======\n" grep $d ready.csv > ready1.csv for l in $learners ; do echo -n "$l," grep "$l," ready1.csv | cut -d, -f 7 |sort -n | quartile #gawk 'BEGIN {OFS=","} # {N++; F[N] =$1} # END {N1=int(N/2); # if (N % 2) { print N,int(F[N1]) # } else { # N2 = N1+1; # print N,int((F[N1] + F[N2])/2) # }}' done | sort -r -t, -n -k 4 |malign echo "" winLossTie --input ready1.csv --fields 7 --perform 7 --key 6 --99 --high done > $Safe/$me.out echo $Safe/$me.out cd $Here } demo104a() { local in=$Safe/demo104.out [ ! 
-f $in ] && demo104 cd $Tmp cat $in | quartile2tex > $Safe/demo104a.out cd $Here } # number of controllables demo105() { local me=demo105 local src="$HOME/wisp/var/timm/08/data.csv" ls -lsa "$src" rm -f $Safe/$me_* gawk 'NR==1 {next} {print}' "$src" > $Tmp/data.csv (cd $Tmp local projects=`cut -d, -f 1 data.csv | sort | uniq` local policies=`cut -d, -f 2 data.csv | sort | uniq` local scorings=`cut -d, -f 3 data.csv | sort | uniq` local mutations=`cut -d, -f 4 data.csv | sort | uniq` demo105selector project,policy,MinMedE data.csv | gawk -F, '{OFS=","; $NF=100*$NF; print $0}'> testa (printf "POLICIES\n\n`date`\n\n" for p in overall $projects do grep "$p," testa > relevant printf "\n\n----| $p |----------\n\n" for pol in $policies; do echo -n "$pol," grep ",$pol," relevant | gawk -F, '{print $NF}' | quartile | sort -t, -n +2 done | malign echo "" winLossTie --input relevant --fields 4 --perform 4 --key "3" --99 --low done ) > $Safe/$me_policyAbsoluteE.txt # changing policies rarely effects anything. the final numbers are so low. but Bug? 
# demo105selector project,policy,FirstMedThreat,MinMedThreat data.csv | # gawk -F, '{OFS=","; print $0,100*($NF/(0.0000000001 + $(NF-1)));}' > testa # printf "HOW MUCH THREAT CONTROL DO WE ACHIEVE?\n\n`date`\n\n" # for p in overall $projects # do grep "$p," testa > relevant # printf "\n\n----| $p |----------\n\n" # for pol in $policies; do # echo -n "$pol," # grep ",$pol," relevant | gawk -F, '{print $NF}' | quartile # done | malign # echo "" # winLossTie --input relevant --fields 6 --perform 6 --key "3" --95 --low # done > $Safe/$me_reduceTHREATS.txt # # usually, healthy reducions in defects # demo105selector project,policy,FirstMedMonths,MinMedMonths data.csv | gawk -F, '{OFS=","; print $0,100*($NF/$(NF-1));}' > testa printf "HOW MUCH MONTHS CONTROL DO WE ACHIEVE?\n\n`date`\n\n" for p in overall $projects do grep "$p," testa > relevant printf "\n\n----| $p |----------\n\n" for pol in $policies; do echo -n "$pol," grep ",$pol," relevant | gawk -F, '{print $NF}' | quartile done | malign | sort -n -t, -n +3 echo "" winLossTie --input relevant --fields 6 --perform 6 --key "3" --99 --low done > $Safe/$me_reduceMonths.txt # usually, healthy reducions in defects demo105selector project,scoring,mutation,numPol data.csv > testa printf "HOW MUCH important ins mutation? 
?\n\n`date`\n\n" for p in overall $projects do grep "$p," testa > relevant printf "\n\n----| $p |----------\n\n" for score in $scorings; do for mut in $mutations; do grep ",$score,$mut," relevant | gawk -F, '{print $NF}' | quartile done done | malign | sort -n -t, -n +3 echo "" winLossTie --input relevant --fields 5 --perform 5 --key "3,4" --99 --low done | tee $Safe/$me_mutValue.txt # usually, healthy reducions in defects exit demo105selector project,policy,FirstMedDefects,MinMedDefects data.csv | gawk -F, '{OFS=","; print $0,100*($NF/$(NF-1));}' > testa printf "HOW MUCH DEFECTS CONTROL DO WE ACHIEVE?\n\n`date`\n\n" (printf "HOW MUCH DEFECTS CONTROL DO WE ACHIEVE?\n\n`date`\n\n" for p in overall $projects do grep "$p," testa > relevant printf "\n\n----| $p |----------\n\n" for pol in $policies; do echo -n "$pol," grep ",$pol," relevant | gawk -F, '{print $NF}' | quartile done | malign echo "" winLossTie --input relevant --fields 6 --perform 6 --key "3" --95 --low done ) > $Safe/$me_reduceEffort.txt # usually, healthy reducions in defects demo105selector project,policy,FirstMedEffort,MinMedEffort data.csv | gawk -F, '{OFS=","; print $0,100*($NF/$(NF-1));}' > testa printf "HOW MUCH EFFORT CONTROL DO WE ACHIEVE?\n\n`date`\n\n" (printf "HOW MUCH EFFORT CONTROL DO WE ACHIEVE?\n\n`date`\n\n" for p in overall $projects do grep "$p," testa > relevant printf "\n\n----| $p |----------\n\n" for pol in $policies; do echo -n "$pol," grep ",$pol," relevant | gawk -F, '{print $NF}' | quartile done | malign echo "" winLossTie --input relevant --fields 6 --perform 6 --key "3" --95 --low done ) > $Safe/$me_reduceEffort.txt # usually, healthy reducions in effort # # demo105selector project,policy,FirstMedE,MinMedE data.csv | # gawk -F, '{OFS=","; print $0,100*($NF/$(NF-1));}' > testa # (printf "HOW MUCH ENERGY CONTROL DO WE ACHIEVE?\n\n`date`\n\n" # for p in overall $projects # do grep "$p," testa > relevant # printf "\n\n----| $p |----------\n\n" # for pol in $policies; do # echo 
-n "$pol," # grep ",$pol," relevant | gawk -F, '{print $NF}' | quartile # done | malign # echo "" # winLossTie --input relevant --fields 6 --perform 6 --key "3" --95 --low # done ) > tee $Safe/$me_reduceEnergy.txt # # # strange- often the energy reduction is close to 0 # # demo105selector project,scoring,mutation,MinMedE,MinSpE data.csv | # gawk -F, '{OFS=",";print $1,$2, $3,$4,100*$5/($6+0.00000000001)}' > testa # (printf "MUTATION SCORING VS RATIO MIN MEDIAN/SPREAD \n\n`date`\n\n" # for p in overall $projects # do grep "$p," testa > relevant # printf "\n\n----| $p |----------\n\n" # for score in $scorings; do # for mut in $mutations; do # echo -n "$score,$mut," # grep ",$score,$mut," relevant | gawk -F, '{print $NF}' | quartile # done # done | malign # echo "" # winLossTie --input relevant --fields 5 --perform 5 --key "3,4" --95 --high # done ) > $Safe/$me_mutationScoringRatio.txt # # # mutation and scoring policies do not effect variance reduction # # demo105selector project,scoring,mutation,numPol data.csv > testa # (printf "MUTATION SCORING VS NUMBER OF pOLICIES\n\n`date`\n\n" # for p in overall $projects # do grep "$p," testa > relevant # printf "\n\n----| $p |----------\n\n" # for score in $scorings; do # for mut in $mutations; do # echo -n "$score,$mut," # grep ",$score,$mut," relevant | gawk -F, '{print $NF}' | quartile # done # done | malign # echo "" # winLossTie --input relevant --fields 5 --perform 5 --key "3,4" --95 --low # done) > $Safe/$me_mutationScoringPolicies.txt # # # mutation and scoring policies do not effect # of policies # demo105selector project,scoring,mutation,FirstMedThreat,MinMedThreat data.csv | gawk -F, '{print $0,",",100*$NF/$(NF-1)}' > testa (printf "MUTATION SCORING VS THREAT REDUCTION\n\n`date`\n\n" for p in overall $projects do grep "$p," testa > relevant printf "\n\n----| $p |----------\n\n" for score in $scorings; do for mut in $mutations; do echo -n "$score,$mut," grep ",$score,$mut," relevant | gawk -F, '{print $NF}' | 
quartile done done | malign echo "" winLossTie --input relevant --fields 7 --perform 7 --key "3,4" --95 --low done ) > $Safe/$me_mutationScoringThreat.txt # bore|extreme best at defect reduction demo105selector project,scoring,mutation,FirstMedMonths,MinMedMonths data.csv | gawk -F, '{print $0,",",100*$NF/$(NF-1)}' > testa printf "MUTATION SCORING VS MONTHS REDUCTION\n\n`date`\n\n" (printf "MUTATION SCORING VS MONTHS REDUCTION\n\n`date`\n\n" for p in overall $projects do grep "$p," testa > relevant printf "\n\n----| $p |----------\n\n" for score in $scorings; do for mut in $mutations; do echo -n "$score,$mut," grep ",$score,$mut," relevant | gawk -F, '{print $NF}' | quartile done done | malign echo "" winLossTie --input relevant --fields 7 --perform 7 --key "3,4" --95 --low done ) > $Safe/$me_mutationScoringMonths.txt # bore|extreme best at defect reduction demo105selector project,scoring,mutation,FirstMedDefects,MinMedDefects data.csv | gawk -F, '{print $0,",",100*$NF/$(NF-1)}' > testa printf "MUTATION SCORING VS DEFECT REDUCTION\n\n`date`\n\n" (printf "MUTATION SCORING VS DEFECT REDUCTION\n\n`date`\n\n" for p in overall $projects do grep "$p," testa > relevant printf "\n\n----| $p |----------\n\n" for score in $scorings; do for mut in $mutations; do echo -n "$score,$mut," grep ",$score,$mut," relevant | gawk -F, '{print $NF}' | quartile done done | malign echo "" winLossTie --input relevant --fields 7 --perform 7 --key "3,4" --95 --low done ) > $Safe/$me_mutationScoringDefects.txt # bore|extreme best at defect reduction demo105selector project,scoring,mutation,FirstMedEffort,MinMedEffort data.csv | gawk -F, '{print $0,",",100*$NF/$(NF-1)}' > testa printf "MUTATION SCORING VS EFFORT REDUCTION\n\n`date`\n\n" (printf "MUTATION SCORING VS EFFORT REDUCTION\n\n`date`\n\n" for p in overall $projects do grep "$p," testa > relevant printf "\n\n----| $p |----------\n\n" for score in $scorings; do for mut in $mutations; do echo -n "$score,$mut," grep ",$score,$mut," 
relevant | gawk -F, '{print $NF}' | quartile done done | malign echo "" winLossTie --input relevant --fields 7 --perform 7 --key "3,4" --95 --low done ) > $Safe/$me_mutationScoringEffort.txt # mutation and scoring policies do not effect effort reduction ) } demo105selector() { cat $2 | gawk -F, -v Want=$1 'BEGIN { dd(); args(); } {printf "overall"; for(I=1;I<=N;I++) printf(",%s",$Goals[I]) ; print "";} function args( n,tmp) { N=split(Want,tmp,/,/); for(i=1;i<=N;i++) Goals[i]=Dd[tmp[i]] } function dd() { Dd["project"] = 1; Dd["policy"] = 2; Dd["scoring"] = 3; Dd["mutation"] = 4; Dd["alpha"] = 5; Dd["beta"] = 6; Dd["gamma"] = 7; Dd["delta"] = 8; Dd["relydefect"] = 9; Dd["run"] = 10; Dd["N"] = 11; Dd["SANum"] = 12; Dd["SATime"] = 13; Dd["TotalTime"] = 14; Dd["minEnergy"] = 15; Dd["numPol"] = 16; Dd["FirstMedE"] = 17; Dd["FirstSpE"] = 18; Dd["FirstMedEffort"] = 19; Dd["FirstSpEffort"] = 20; Dd["FirstMedDefects"] = 21; Dd["FirstSpDefects"] = 22; Dd["FirstMedThreat"] = 23; Dd["FirstSpThreat"] = 24; Dd["FirstMedMonths"] = 25; Dd["FirstSpMonths"] = 26; Dd["MinMedE"] = 27; Dd["MinSpE"] = 28; Dd["MinMedEffort"] = 29 Dd["MinSpEffort"] = 30; Dd["MinMedDefects"] = 31; Dd["MinSpDefects"] = 32; Dd["MinMedThreat"] = 33; Dd["MinSpThreat"] = 34; Dd["MinMedMonths"] = 35; Dd["MinSpMonths"] = 36; } ' } lasttimes100() { gawk -F, '{OFS=","; $NF=100*$NF; print $0}' - } demo106() { local me=demo106 local src="$HOME/wisp/var/timm/08/data-sampling.csv" ls -lsa "$src" rm -f $Safe/$me_* gawk 'NR==1 {next} {print}' "$src" > $Tmp/data.csv cd $Tmp local projects=`cut -d, -f 1 data.csv | sort | uniq` local policies=`cut -d, -f 2 data.csv | sort | uniq` local scorings=`cut -d, -f 3 data.csv | sort | uniq` local mutations=`cut -d, -f 4 data.csv | sort | uniq` local ns=`cut -d, -f 11 data.csv | sort | uniq` local sanums=`cut -d, -f 12 data.csv | sort | uniq` echo "ns $ns" echo "sanums $sanums" #demo106selector project,SANum,N,MinSpE,MinMedE,MinSpEffort,MinMedEffort data.csv > testa demo106selector 
project,SANum,FirstMedEffort,MinMedEffort data.csv | gawk -F, '{OFS=","; print $0, 100*$(NF)/($(NF-1)+ 0.00000001)}'> testa for p in overall $projects do grep "$p," testa > relevant printf "\n\n----| $p |----------\n\n" for a in $sanums; do echo -n "$a," grep ",$a," relevant | gawk -F, '{print $NF}' | quartile | sort -t, -n +2 done | malign | sort -t, -n +3 echo "" echo "n" winLossTie --input relevant --fields 6 --perform 6 --key "3" --95 --low done |tee $Safe/${me}_sa_Effort.txt demo106selector project,SANum,FirstMedDefects,MinMedDefects data.csv | gawk -F, '{OFS=","; print $0, 100*$(NF)/($(NF-1)+ 0.00000001)}'> testa for p in overall $projects do grep "$p," testa > relevant printf "\n\n----| $p |----------\n\n" for a in $sanums; do echo -n "$a," grep ",$a," relevant | gawk -F, '{print $NF}' | quartile | sort -t, -n +2 done | malign | sort -t, -n +3 echo "" echo "n" winLossTie --input relevant --fields 6 --perform 6 --key "3" --95 --low done |tee $Safe/${me}_sa_Defects.txt demo106selector project,SANum,FirstMedMonths,MinMedMonths data.csv | gawk -F, '{OFS=","; print $0, 100*$(NF)/($(NF-1)+ 0.00000001)}'> testa for p in overall $projects do grep "$p," testa > relevant printf "\n\n----| $p |----------\n\n" for a in $sanums; do echo -n "$a," grep ",$a," relevant | gawk -F, '{print $NF}' | quartile | sort -t, -n +2 done | malign | sort -t, -n +3 echo "" echo "n" winLossTie --input relevant --fields 6 --perform 6 --key "3" --95 --low done |tee $Safe/${me}_sa_Months.txt demo106selector project,SANum,FirstMedThreat,MinMedThreat data.csv | gawk -F, '{OFS=","; print $0, 100*$(NF)/($(NF-1)+ 0.00000001)}'> testa for p in overall $projects do grep "$p," testa > relevant printf "\n\n----| $p |----------\n\n" for a in $sanums; do echo -n "$a," grep ",$a," relevant | gawk -F, '{print $NF}' | quartile | sort -t, -n +2 done | malign | sort -t, -n +3 echo "" echo "n" winLossTie --input relevant --fields 6 --perform 6 --key "3" --95 --low done |tee $Safe/${me}_sa_Threat.txt return 0 
demo106selector project,N,MinMedThreat,MinSpThreat data.csv | gawk -F, '{OFS=","; print $0, 100*$(NF)/$(NF-1)}'> testa for p in overall $projects do grep "$p," testa > relevant printf "\n\n----| $p |----------\n\n" for n in $ns; do echo -n "$n," grep ",$n," relevant | gawk -F, '{print $NF}' | quartile | sort -t, -n +2 done | malign | sort -t, -n +3 echo "" echo "n" winLossTie --input relevant --fields 6 --perform 6 --key "3" --95 --low done > $Safe/${me}_n_Threat.txt demo106selector project,N,MinMedEffort,MinSpEffort data.csv | gawk -F, '{OFS=","; print $0, 100*$(NF)/$(NF-1)}'> testa for p in overall $projects do grep "$p," testa > relevant printf "\n\n----| $p |----------\n\n" for n in $ns; do echo -n "$n," grep ",$n," relevant | gawk -F, '{print $NF}' | quartile | sort -t, -n +2 done | malign | sort -t, -n +3 echo "" echo "n" winLossTie --input relevant --fields 6 --perform 6 --key "3" --95 --low done > $Safe/${me}_n_effort.txt demo106selector project,N,MinMedDefects,MinSpDefects data.csv | gawk -F, '{OFS=","; print $0, 100*$(NF)/$(NF-1)}'> testa for p in overall $projects do grep "$p," testa > relevant printf "\n\n----| $p |----------\n\n" for n in $ns; do echo -n "$n," grep ",$n," relevant | gawk -F, '{print $NF}' | quartile | sort -t, -n +2 done | malign | sort -t, -n +3 echo "" echo "n" winLossTie --input relevant --fields 6 --perform 6 --key "3" --95 --low done > $Safe/${me}_n_Defects.txt demo106selector project,N,MinMedMonths,MinSpMonths data.csv | gawk -F, '{OFS=","; print $0, 100*$(NF)/$(NF-1)}'> testa for p in overall $projects do grep "$p," testa > relevant printf "\n\n----| $p |----------\n\n" for n in $ns; do echo -n "$n," grep ",$n," relevant | gawk -F, '{print $NF}' | quartile | sort -t, -n +2 done | malign | sort -t, -n +3 echo "" echo "n" winLossTie --input relevant --fields 6 --perform 6 --key "3" --95 --low done > $Safe/${me}_n_Months.txt } demo106selector() { cat $2 | gawk -F, -v Want=$1 'BEGIN { dd(); args(); } {printf "overall"; 
for(I=1;I<=N;I++) printf(",%s",$Goals[I]) ; print "";} function args( n,tmp) { N=split(Want,tmp,/,/); for(i=1;i<=N;i++) Goals[i]=Dd[tmp[i]] } function dd() { Dd["project"] = 1; Dd["policy"] = 2; Dd["scoring"] = 3; Dd["mutation"] = 4; Dd["alpha"] = 5; Dd["beta"] = 6; Dd["gamma"] = 7; Dd["delta"] = 8; Dd["relydefect"] = 9; Dd["run"] = 10; Dd["N"] = 11; Dd["SANum"] = 12; Dd["SATime"] = 13; Dd["TotalTime"] = 14; Dd["minEnergy"] = 15; Dd["numPol"] = 16; Dd["attNumber"] = 17; Dd["FirstMedE"] = 18; Dd["FirstSpE"] = 19; Dd["FirstMedEffort"] = 20; Dd["FirstSpEffort"] = 21; Dd["FirstMedDefects"] = 22; Dd["FirstSpDefects"] = 23; Dd["FirstMedThreat"] = 24; Dd["FirstSpThreat"] = 25; Dd["FirstMedMonths"] = 26; Dd["FirstSpMonths"] = 27; Dd["MinMedE"] = 28; Dd["MinSpE"] = 29; Dd["MinMedEffort"] = 30 Dd["MinSpEffort"] = 31; Dd["MinMedDefects"] = 32; Dd["MinSpDefects"] = 33; Dd["MinMedThreat"] = 34; Dd["MinSpThreat"] = 35; Dd["MinMedMonths"] = 36; Dd["MinSpMonths"] = 37; } ' } #please add columns for: #a1) the #of decisions required to reach minimum point #a2) the number of variables in "a1". e.g. if "a1"= acap=hi and #acap=lo then "a2" is only "1". #b) the max possible energy (which is always, 1, right?) #c) the min energy reached during SA (so we can see how well our #policies do) #d) bore vs energy scoring #e) number of runs in the SA #f) number of runs for each point in the back select #g) sa runtime #h) total run time (only the "C" not the shell scripts) #i) policy: all, strategic, tactical, hohin's 8 different ideas. #j) spreadE/medianE #i) spreadE and medianE see at the FIRST step of the back select # #have we got the ttests going to allow early stopping? that would #effect (a) # #have we got those new threat tables going so we can avoid dumb ass #mistakes like high tool and low acap? # #what else? 
# we want to say
#- that we can do well with a few decision ("a1" and "a2" and "i")
#- that the spread of the final results is very small (j)
#- that sa is an adequate (if "c" is very low);
#- that back select is adequate (medianE/"c")
#- that bore is good (that's "d")
#- that smaller runs are as good as larger runs ("e" and "f")
#- that our tool is fast "g"
#- that our "do anything policies" is as good as anything "hohin"
#offers ("i")
#}

#          oner nb   j48
#auto      56.6 60.4 85.9*
#diabetes  57.2 68.5 69.3*
#
#demo10() {
#  demo9 | gawk -F, '/@/ {next}
#                    NF>1 {print $NF}' | sort | uniq -c
#}
#demo11() {
#  setup; cd $Tmp
#  demo9 > data.arff
#
#  c=0.1
#  printf "confidence limit for pruning = $c (very selective)\n\n"
#  j4810c data.arff $c | report 0 3,18,16
#
#  c=0.25
#  printf "confidence limit for pruning = $c (default, less selective)\n\n"
#  j4810c data.arff $c | report 0 3,18,16
#  cd $Here
#}
#demo1001() {
#  setUpVars
#  setUpDirs
#  setUpSeds
#  prep
#  cd $Tmp
#  pwd
#  makeshare
#  worker1001 > log
#  cp log $Safe/demo1.log
#  winLossTie log | tee $Safe/demo1.winLossTie
#}
#demo1002() {
#  setUpVars
#  setUpDirs
#  setUpSeds
#  prep
#  cd $Tmp
#  pwd
#  makeshare
#  Learners="j48 jrip oner nb aode"
#  worker1002 > log
#  cp log $Safe/demo1.log
#  winLossTie log | tee $Safe/demo1.winLossTie
#}

#### start up

# Runs when the file is sourced: set up variables and directories,
# extend AWKPATH, then print the banner on stderr.
setup
build
blabln "OurMine version v0.2 (alpha) (c)2007 tim@menzies.us under GPLv3"
blabln "Too many doings, not enough learnings.\n"