# COMBA 3
#
# Third revision of COMBA using shell-scripting.
# This version is updated to reflect current design decisions and needs.
# Specifically, in this version only preprocessors and learners are defined
# by the user.
#
# The file only defines settings and functions; nothing runs until `comba`
# (or one of the stage functions) is called.  It sets PS1 at the end, so it
# appears to be meant for sourcing into an interactive shell.
# $Here is read throughout but never assigned in this file; it must be set by
# the caller before sourcing.

Java="$Here"
# Command prefixes.  $Weka is a command *with arguments*, so it is always
# expanded unquoted on purpose.
Weka="nice -n 20 java -Xmx2048M -cp $Java/weka.jar "
Octave="/Applications/Octave.app/Contents/Resources/bin/octave"
Seed=1

###########################################################################
# Experiment driver
#

# Empty every experiment list and every name list.
# Globals written: dataList preprocessorList learnerList errorList
#                  dNames pNames lNames eNames
# Plain `name=()` is used instead of `declare -a`, because `declare` inside a
# function creates a local that disappears on return.
initializeTestArrays() {
  dataList=()
  preprocessorList=()
  learnerList=()
  errorList=()
  dNames=()
  pNames=()
  lNames=()
  eNames=()
}

# Leave-one-out run: for every data set and every row, build train.csv and
# test.csv, then evaluate each preprocessor/learner combination via runEval.
# Globals read: dataList preprocessorList learnerList
runExperiment() {
  local dataset fold folds ppIdx lrnIdx

  for dataset in "${dataList[@]}"; do
    "$dataset"                       # sets $data and $data_name
    folds=$(getSize)
    for ((fold = 1; fold <= folds; fold++)); do
      getTrain "$fold"
      getTest "$fold"
      currentpp=""                   # force preprocessing for the new fold
      for ((ppIdx = 0; ppIdx < ${#preprocessorList[@]}; ppIdx++)); do
        # List entries are "function [args]" strings: word splitting is
        # intentional here.
        # shellcheck disable=SC2086
        ${preprocessorList[ppIdx]}
        for ((lrnIdx = 0; lrnIdx < ${#learnerList[@]}; lrnIdx++)); do
          # shellcheck disable=SC2086
          ${learnerList[lrnIdx]}
          runEval
        done
      done
    done
  done
}

# Entry point: fill the experiment lists, then run the enabled stages.
comba() {
  initializeTestArrays

  # Initialize dataList
  dataList=(
    d_albrecht d_china d_cocomo81 d_cocomo81e d_cocomo81o d_cocomo81s
    d_desharnais d_desharnaisL1 d_desharnaisL2 d_desharnaisL3
    d_finnish d_kemerer d_maxwell d_miyazaki94
    d_nasa93center1 d_nasa93center2 d_nasa93center5
    d_sdr d_telecom1 d_nasa93
  )

  # Initialize errorList
  errorList=(e_ar e_mre e_mer e_bre e_ibre)

  # Define Preprocessors
  preprocessorList=(pp_none pp_bamboo pp_sprout pp_dist)

  # Define Learners
  # learnerList[0]="l_bambooLISP"
  #learnerList[0]="l_bamboo"
  #learnerList[1]="l_sprout"
  learnerList=("l_OctNN 1" "l_OctNN 2" "l_OctNN 4" "l_OctNN 8")

  # Stages currently switched off:
  #runExperiment
  #runErrors
  #runSigtests
  setNames
  computeFinalResults
}

# Evaluate the currently selected preprocessor/learner on the current fold.
# The preprocessor is only re-run when it differs from the one that produced
# the existing trainP.csv/testP.csv ($currentpp).
runEval() {
  setOutfile
  if [ "$currentpp" != "$preprocessor_name" ]; then
    preprocessor
  fi
  learner
}

# Compute every error measure for every data set/preprocessor/learner result
# file, recording the names of everything visited in dNames/pNames/lNames/
# eNames along the way.
runErrors() {
  local dIdx ppIdx lrnIdx errIdx

  for ((dIdx = 0; dIdx < ${#dataList[@]}; dIdx++)); do
    # shellcheck disable=SC2086
    ${dataList[dIdx]}
    dNames[dIdx]=$data_name
    for ((ppIdx = 0; ppIdx < ${#preprocessorList[@]}; ppIdx++)); do
      # shellcheck disable=SC2086
      ${preprocessorList[ppIdx]}
      pNames[ppIdx]=$preprocessor_name
      for ((lrnIdx = 0; lrnIdx < ${#learnerList[@]}; lrnIdx++)); do
        # shellcheck disable=SC2086
        ${learnerList[lrnIdx]}
        lNames[lrnIdx]=$learner_name
        setOutfile
        infile=$outfile              # raw result file is the error input
        for ((errIdx = 0; errIdx < ${#errorList[@]}; errIdx++)); do
          # shellcheck disable=SC2086
          ${errorList[errIdx]}
          eNames[errIdx]=$error_name
          errorEval
        done
      done
    done
  done
}

# Same traversal as runErrors but without touching any file: it only fills
# dNames/pNames/lNames/eNames from the configured lists.
setNames() {
  local dIdx ppIdx lrnIdx errIdx

  for ((dIdx = 0; dIdx < ${#dataList[@]}; dIdx++)); do
    # shellcheck disable=SC2086
    ${dataList[dIdx]}
    dNames[dIdx]=$data_name
    for ((ppIdx = 0; ppIdx < ${#preprocessorList[@]}; ppIdx++)); do
      # shellcheck disable=SC2086
      ${preprocessorList[ppIdx]}
      pNames[ppIdx]=$preprocessor_name
      for ((lrnIdx = 0; lrnIdx < ${#learnerList[@]}; lrnIdx++)); do
        # shellcheck disable=SC2086
        ${learnerList[lrnIdx]}
        lNames[lrnIdx]=$learner_name
        for ((errIdx = 0; errIdx < ${#errorList[@]}; errIdx++)); do
          # shellcheck disable=SC2086
          ${errorList[errIdx]}
          eNames[errIdx]=$error_name
        done
      done
    done
  done
}

# Run the Wilcoxon comparisons between preprocessor/learner combinations for
# every data set and error measure.  For the "mre" measure the median and
# pred25 variants are run as well, writing to "<file>med" / "<file>pred".
# NOTE(review): the inner learner index starts at the outer learner index
# even when the two preprocessors differ, so not every cross pair is
# compared - kept exactly as in the original; confirm this is intended.
runSigtests() {
  local dName eName c1 c2 c3 c4 f1 w1 w2 w3 w4
  local pCount lCount

  for dName in "${dNames[@]}"; do
    data_name=$dName
    for eName in "${eNames[@]}"; do
      error_name=$eName
      pCount=${#pNames[@]}
      lCount=${#lNames[@]}
      for ((c1 = 0; c1 < pCount; c1++)); do
        for ((c2 = c1; c2 < pCount; c2++)); do
          for ((c3 = 0; c3 < lCount; c3++)); do
            for ((c4 = c3; c4 < lCount; c4++)); do
              # First combination.
              preprocessor_name=${pNames[c1]}
              learner_name=${lNames[c3]}
              setOutfile3
              setOutfileW
              f1=$outfile3
              w1=$outfileW

              # Second combination ($outfile3 now refers to it).
              preprocessor_name=${pNames[c2]}
              learner_name=${lNames[c4]}
              setOutfile3
              setOutfileW
              w2=$outfileW

              if [ "$w1" != "$w2" ]; then
                printf '%s %s\n' "$w1" "$w2" # for debug
                wilcoxonSRT "$f1" "$outfile3" "$w1" "$w2"
                if [ "$error_name" = "mre" ]; then
                  w3=${w1}med
                  w4=${w2}med
                  wilcoxonSRTmd "$f1" "$outfile3" "$w3" "$w4"
                  w3=${w1}pred
                  w4=${w2}pred
                  wilcoxonSRTpred25 "$f1" "$outfile3" "$w3" "$w4"
                fi
              fi
            done
          done
        done
      done
    done
  done
}

# Append the summary table to ./results.csv: one header row
# ("data,<pp>_<learner>,...") followed by one row per data set whose cells are
# produced by sumWSRT.
computeFinalResults() {
  local pName lName dName

  # Header.  The first cell belongs in results.csv with a trailing comma so
  # that it lines up with the "<data_name>," cell of the data rows.
  printf 'data,' >> ./results.csv
  for pName in "${pNames[@]}"; do
    for lName in "${lNames[@]}"; do
      printf '%s_%s,' "$pName" "$lName" >> ./results.csv
    done
  done
  echo >> ./results.csv

  for dName in "${dNames[@]}"; do
    data_name=$dName
    printf '%s,' "$data_name" >> ./results.csv
    split=$lName                     # kept from the original; not read in this file
    for pName in "${pNames[@]}"; do
      preprocessor_name=$pName
      for lName in "${lNames[@]}"; do
        learner_name=$lName
        sumWSRT
      done
    done
    echo >> ./results.csv
  done
}

# Print "wins ties losses" for one .mww file ($1): the number of lines whose
# first field is 1, 0 and -1 respectively.  Counters start at 0 so that the
# output is always three numbers.
tallyWTL() {
  gawk '{if($1 == 1){ w = w + 1;} if($1 == 0){ t = t + 1;} if($1 == -1){ l = l + 1; }} END{print w " " t " " l}' \
    w=0 t=0 l=0 "$1"
}

# Sum wins/ties/losses over all error measures for the current
# $data_name/$preprocessor_name/$learner_name and append "losses/total," to
# ./results.csv.  The cell is left empty when nothing was tallied.
# Globals written: wins ties losses sumwlt infileW infileW1 infileW2 error_name
sumWSRT() {
  local eName ratio

  infileW="./results/${data_name}_${preprocessor_name}_${learner_name}_"
  wins=0
  ties=0
  losses=0

  for eName in "${eNames[@]}"; do
    error_name=$eName
    infileW1="${infileW}${error_name}.mww"
    # Each file is tallied from zero; sumWTL does the accumulation.  (Seeding
    # gawk with the running totals as well would count earlier files twice.)
    # shellcheck disable=SC2046
    sumWTL $(tallyWTL "$infileW1")
    if [ "$eName" = "mre" ]; then
      infileW2="${infileW1}med"
      # shellcheck disable=SC2046
      sumWTL $(tallyWTL "$infileW2")
      infileW2="${infileW1}pred"
      # shellcheck disable=SC2046
      sumWTL $(tallyWTL "$infileW2")
    fi
  done

  sumwlt=$((wins + ties + losses))
  # BEGIN block: no input file is needed, and s == 0 yields an empty cell
  # instead of a division-by-zero error.
  ratio=$(gawk -v l="$losses" -v s="$sumwlt" 'BEGIN { if (s > 0) print l / s }')
  printf '%s,' "$ratio" >> ./results.csv
}

# Add $1/$2/$3 to the global wins/ties/losses counters.
sumWTL() {
  wins=$((${1:-0} + wins))
  ties=$((${2:-0} + ties))
  losses=$((${3:-0} + losses))
}

# Compute the currently selected error measure for the current result file.
errorEval() {
  #Check if run exists
  setOutfile2
  errorMeasure
}

###########################################################################
# Output file names (all derived from the current *_name globals)
#

# Raw result file of one data set/preprocessor/learner combination.
setOutfile() {
  outfile="./results/${data_name}_${preprocessor_name}_${learner_name}.csv"
}

# Error-measure file, relative path.
setOutfile2() {
  outfile2="./results/${data_name}_${preprocessor_name}_${learner_name}_${error_name}.csv"
}

# Error-measure file, rooted at $Here (this is the path handed to Octave).
setOutfile3() {
  outfile3="${Here}/results/${data_name}_${preprocessor_name}_${learner_name}_${error_name}.csv"
}

# Wilcoxon win/tie/loss file.
setOutfileW() {
  outfileW="./results/${data_name}_${preprocessor_name}_${learner_name}_${error_name}.mww"
}

# Filter for WEKA prediction output on stdin.
formatFile() {
  # First column is actual, second is predicted
  gawk '{ if (NR > 5 && $2 && $3) print $2 "," $3; }'
}

# Variant of formatFile that first turns ':' into spaces.
formatFile2() {
  sed 's/:/\ /g' | gawk '{ if (NR > 5 && $3 && $7) print $3 "," $5; }'
}

###########################################################################
# Leave One Out Scripts
#

# Print the first argument.
getFirst() {
  echo $1
}

# Print the number of lines in $data.
getSize() {
  # Unquoted on purpose: `wc -l FILE` prints "<count> <file>" and getFirst
  # keeps only the count.
  # shellcheck disable=SC2046
  getFirst $(wc -l "$data")
}

# Write $data without line $1 to train.csv.
getTrain() {
  sed "${1}d" "$data" > train.csv
}

# Write only line $1 of $data to test.csv.
getTest() {
  sed "${1}"'!d' "$data" > test.csv
}

###########################################################################
# Data Sets
#

# Select a data set: $1 = data_name, $2 = file stem under ./data when it
# differs from the name.
selectDataset() {
  data="./data/${2:-$1}.csv"
  data_name=$1
}

d_albrecht()      { selectDataset albrecht; }
d_china()         { selectDataset china; }
d_cocomo81()      { selectDataset cocomo81; }
d_cocomo81e()     { selectDataset cocomo81e; }
d_cocomo81o()     { selectDataset cocomo81o; }
d_cocomo81s()     { selectDataset cocomo81s; }
d_desharnais()    { selectDataset desharnais; }
d_desharnaisL1()  { selectDataset desharnaisL1; }
d_desharnaisL2()  { selectDataset desharnaisL2; }
d_desharnaisL3()  { selectDataset desharnaisL3; }
d_finnish()       { selectDataset finnish; }
d_kemerer()       { selectDataset kemerer; }
d_maxwell()       { selectDataset maxwell; }
d_miyazaki94()    { selectDataset miyazaki94; }
d_nasa93()        { selectDataset nasa93; }
d_nasa93center1() { selectDataset nasa93center1 nasa93_center_1; }
d_nasa93center2() { selectDataset nasa93center2 nasa93_center_2; }
d_nasa93center5() { selectDataset nasa93center5 nasa93_center_5; }
d_sdr()           { selectDataset sdr; }
d_telecom1()      { selectDataset telecom1; }

###########################################################################
# Preprocessors
#
# Each pp_* function selects a preprocessor: it sets $preprocessor_name and
# (re)defines preprocessor(), which turns train.csv/test.csv into
# trainP.csv/testP.csv and records itself in $currentpp.
#

# Shared body of the Octave-based preprocessors.  $1 = function that runs the
# Octave preprocessing inside ./supportCode/bamboo_octave.
octavePreprocess() {
  cat test.csv > testP.csv
  cp test.csv ./supportCode/bamboo_octave/test.csv
  cp train.csv ./supportCode/bamboo_octave/train.csv
  "$1"
  cp ./supportCode/bamboo_octave/trainP.csv ./trainP.csv
}

# None #
pp_none() {
  preprocessor_name=none
  preprocessor() {
    currentpp=none
    cat train.csv > trainP.csv
    cat test.csv > testP.csv
  }
}

# Bamboo #
pp_bamboo() {
  preprocessor_name=bamboo
  preprocessor() {
    currentpp=bamboo
    octavePreprocess bambooPP
  }
}

# Sprout #
pp_sprout() {
  preprocessor_name=sprout
  preprocessor() {
    currentpp=sprout
    octavePreprocess sproutPP
  }
}

# Dist #
pp_dist() {
  preprocessor_name=dist
  preprocessor() {
    currentpp=dist
    octavePreprocess distPP
  }
}

# Logarithmic #
pp_log() {
  preprocessor_name=log
  #log(e) = 0.434294482
  preprocessor() {
    currentpp=log
    $Weka weka.filters.unsupervised.attribute.MathExpression -R last -E "ifelse(A=0,0,(log(A))/0.434294482)" -i train.csv -o trainP.csv
    $Weka weka.filters.unsupervised.attribute.MathExpression -R last -E "ifelse(A=0,0,(log(A))/0.434294482)" -i test.csv -o testP.csv
  }
}

###########################################################################
# Learners
#
# Each l_* function selects a learner: it sets $learner_name and (re)defines
# learner(), which appends one result row to $outfile.
#

# Shared body of the Octave-based learners.  "$@" is the estimator command,
# which must leave its estimate in $est; the row written is "<actual>,<est>".
octaveLearner() {
  cp testP.csv ./supportCode/bamboo_octave
  cp trainP.csv ./supportCode/bamboo_octave
  "$@"
  actualE
  act+=","
  act+=$est
  # Unquoted on purpose: keeps the original collapsing of any whitespace or
  # newlines in the captured Octave output into single spaces.
  # shellcheck disable=SC2086
  echo $act >> "$outfile"
}

l_bambooLISP() {
  learner_name=bambooLISP
  learner() {
    injection
    c2a2 testP2
    c2a2 trainP2
    cp testP2.arff ./supportCode/BAMBOO/testP.arff
    cp trainP2.arff ./supportCode/BAMBOO/trainP.arff
    est=$(sbcl --script supportCode/BAMBOO/bamboo_l.lisp)
    cp testP.csv supportCode/bamboo_octave
    cp trainP.csv supportCode/bamboo_octave
    actualE
    act+=","
    act+=$est
    # Write the "<actual>,<est>" row like every other learner; writing $est
    # alone would discard the actual value the error measures need.
    # shellcheck disable=SC2086
    echo $act >> "$outfile"
  }
}

# Analogy Based Estimation - n Nearest Neighbor #
# $1 = number of neighbours (WEKA IBk).
l_nNearN() {
  learner_name=nn$1
  lVar=$1
  learner() {
    injection
    $Weka weka.classifiers.lazy.IBk -K $lVar -t trainP2.csv -T testP2.csv -p 0 | formatFile >> "$outfile"
  }
}

# $1 = number of neighbours (Octave knn).
l_OctNN() {
  learner_name=nn$1
  lVar=$1
  learner() {
    octaveLearner knn "$lVar"
  }
}

l_bamboo() {
  learner_name=bamboo
  learner() {
    octaveLearner bamboo
  }
}

l_sprout() {
  learner_name=sprout
  learner() {
    octaveLearner sprout
  }
}

# Build testP2.csv/trainP2.csv: testP.csv/trainP.csv preceded by a fake
# labeling row of zeros with one field per column of test.csv.
injection() {
  # need to inject a fake labeling row
  local iC

  cols=$(grep -o ',' test.csv | wc -w)   # number of commas in test.csv
  inject="0,"
  for ((iC = 1; iC < cols; iC++)); do
    inject+="0,"
  done
  inject+="0"

  { printf '%s\n' "$inject"; cat testP.csv; } > testP2.csv
  { printf '%s\n' "$inject"; cat trainP.csv; } > trainP2.csv
}

###########################################################################
# Error Calculators
#
# Each e_* function selects an error measure: it sets $error_name and
# (re)defines errorMeasure(), which derives $outfile2 from $infile.
#

# Shared body of every errorMeasure: replace commas by spaces in $infile and
# run the gawk program $1 over the result, writing to $outfile2.
# NOTE(review): the guard is kept exactly as written (unquoted).  $rawfile is
# never assigned in this file, so `[ -N $rawfile ]` collapses to `[ -N ]`,
# which is true; the measure is therefore recomputed on every call.  Quoting
# the variables would change that, so decide on the intended freshness check
# before tidying this up.
computeErrorColumn() {
  # shellcheck disable=SC2086,SC2070
  if [ ! -e $outfile ] || [ -N $rawfile ]; then
    sed 's/\,/\ /g' "$infile" | gawk "$1" > "$outfile2"
  fi
}

# NOTE(review) for all formulas below: `print` is not part of the `if`, so a
# row with a zero/empty column still prints, using the previous row's `a`.
# The programs are kept byte-for-byte so row counts stay paired.

# Absolute Residual Error #
e_ar() {
  error_name=ar
  errorMeasure() {
    computeErrorColumn '{if($1 && $2) a = $1 - $2; print (a >= 0) ? a : -a; }'
  }
}

# Magnitude of Relative Error #
e_mre() {
  error_name=mre
  errorMeasure() {
    computeErrorColumn '{if($1 && $2) a = $1 - $2; print (a >= 0) ? a / $1 : -a / $1; }'
  }
}

# Magnitude of Error Relative to the Estimate #
e_mer() {
  error_name=mer
  errorMeasure() {
    computeErrorColumn '{if($1 && $2) a = $1 - $2; print (a >= 0) ? a / $2 : -a / $2; }'
  }
}

# Balanced Relative Error #
e_bre() {
  error_name=bre
  errorMeasure() {
    computeErrorColumn '{if($1 && $2) a = $1 - $2; if ($1 > $2) { print (a >= 0) ? a / $2 : -a / $2;} else {print (a >= 0) ? a / $1 : -a / $1;}}'
  }
}

# Inverted Balanced Relative Error #
e_ibre() {
  error_name=ibre
  errorMeasure() {
    computeErrorColumn '{if($1 && $2) a = $1 - $2; if ($1 < $2) { print (a >= 0) ? a / $2 : -a / $2;} else {print (a >= 0) ? a / $1 : -a / $1;}}'
  }
}

###########################################################################
# Octave bridges
#
# Every bridge stores the Octave expression in $passArgs and the load path in
# $progLoc, then pipes the expression into Octave.
#

# Pipe $passArgs into Octave with $progLoc on its load path; Octave's output
# goes to stdout.
octaveEval() {
  # $Octave is left unquoted, like $Weka, so a value that carries extra
  # arguments still works.
  # shellcheck disable=SC2086
  printf '%s\n' "$passArgs" | $Octave -q --path "$progLoc"
}

# Shared body of the three Wilcoxon bridges.
#   $1 = Octave function name
#   $2 = X values file      $3 = Y values file
#   $4 = result file of X   $5 = result file of Y
# The third field of Octave's output is appended to $5 and its negation to $4.
wilcoxonCompare() {
  passArgs="$1(\"$2\",\"$3\")"
  progLoc=$Here/supportCode
  wsrt=$(octaveEval | gawk '{print $3}')
  # Unquoted/expr on purpose: an empty or odd $wsrt still appends exactly one
  # line to each file, as before, so the two files stay row-aligned.
  # shellcheck disable=SC2086,SC2046,SC2005,SC2003
  echo $wsrt >> "$5"
  # shellcheck disable=SC2086,SC2046,SC2005,SC2003
  echo $(expr 0 - $wsrt) >> "$4"
}

# Wilcoxon Signed-Rank Test mean comparison#
wilcoxonSRT() {
  # $1 = X values file
  # $2 = Y values file
  # $wsrt: 1 is loss for w1, 0 is tie, -1 is win for w1
  wilcoxonCompare wilcoxonSRT "$@"
}

# Wilcoxon Signed-Rank Test median comparison#
wilcoxonSRTmd() {
  # $1 = X values file
  # $2 = Y values file
  # 1 is loss, 0 is tie, -1 is win
  wilcoxonCompare wilcoxonSRTmed "$@"
}

# Wilcoxon Signed-Rank Test pred25 comparison#
wilcoxonSRTpred25() {
  # $1 = X values file
  # $2 = Y values file
  # 1 is loss, 0 is tie, -1 is win
  wilcoxonCompare wilcoxonSRTpred25 "$@"
}

# BAMBOO # - leaves the estimate in $est.
bamboo() {
  passArgs="bamboo(\"trainP.csv\" ,\"testP.csv\")"
  progLoc=$Here/supportCode/bamboo_octave
  est=$(octaveEval | gawk '{print $3}')
}

# BAMBOO PP #
bambooPP() {
  passArgs="bambooPP()"
  progLoc=$Here/supportCode/bamboo_octave
  octaveEval
}

# SPROUT PP #
sproutPP() {
  passArgs="sproutPP()"
  progLoc=$Here/supportCode/bamboo_octave
  octaveEval
}

# dist PP #
distPP() {
  passArgs="distPP()"
  progLoc=$Here/supportCode/bamboo_octave
  octaveEval
}

# sprout # - leaves the estimate in $est.
sprout() {
  passArgs="sprout(\"trainP.csv\" ,\"testP.csv\")"
  progLoc=$Here/supportCode/bamboo_octave
  est=$(octaveEval | gawk '{print $3}')
}

# kNearestNeighbors # - $1 = k; leaves the estimate in $est.
knn() {
  passArgs="knn(\"trainP.csv\" ,\"testP.csv\",$1)"
  progLoc=$Here/supportCode/bamboo_octave
  est=$(octaveEval | gawk '{print $3}')
}

# actualE # - leaves the value reported by Octave's actual() in $act.
actualE() {
  passArgs="actual(\"testP.csv\")"
  progLoc=$Here/supportCode/bamboo_octave
  act=$(octaveEval | gawk '{print $3}')
}

###########################################################################
# General Code
#

# Make override for debug # - re-source the script from $Here.
make() {
  cd "$Here" || return
  . comba3.bash
}

# WEKA Wrapper for Debug # - run WEKA with all given arguments.
weka() {
  $Weka "$@"
}

# CSV to ARFF conversion # - $1 = file stem: converts <stem>.csv to <stem>.arff.
c2a2() {
  local incsv="$1.csv"
  local outarff="$1.arff"
  local converter="weka.core.converters.CSVLoader"
  $Weka "$converter" "$incsv" > "$outarff"
}

# System Opening Messages #
echo "COMBA 3 by Vincent Rogers and William Sica"
echo ""
PS1="COMBA> "