runTimeVsK() { local data=$1 file=$Ourmine/data/results/step/graphs/$2 sizes=`cat $data | grep -v \# | cut -d, -f 5 | sort -n | uniq` (for size in $sizes; do m1=`cat $data | grep genic | gawk 'BEGIN{FS=","} index($5,k)!=0 {print $7}' k=$size - | median` m2=`cat $data | grep kmeans | gawk 'BEGIN{FS=","} index($5,k)!=0 {print $7}' k=$size - | median` m3=`cat $data | grep canopy | gawk 'BEGIN{FS=","} index($5,k)!=0 {print $7}' k=$size - | median` echo "size= $size, m1 = $m1, m2 = $m2, m3 = $m3" line=${size}X${m1}X${m2}X${m3} echo $line | sed 's/X/\t/g' | sed 's/k//g' done) | sort -n > tmp.dat echo "set terminal postscript" > tmp.plt # echo "set log x" >> tmp.plt # echo "set log y" >> tmp.plt echo "set autoscale x" >> tmp.plt echo "set autoscale y" >> tmp.plt echo "set xlabel \"K (number of clusters)\"" >> tmp.plt echo "set ylabel \"Run-time (seconds)\"" >> tmp.plt echo "set output '$file'" >> tmp.plt echo "set title '$3 --- Number of Clusters vs. Run-time'" >> tmp.plt echo "plot 'tmp.dat' using 1:2 with linespoints lt 1 pt 8 t \"genic\"," \ "'tmp.dat' using 1:3 with linespoints lt 1 pt 4 t \"kmeans\"" \ ",'tmp.dat' using 1:4 with linespoints lt 1 pt 6 t \"canopy\"" >> tmp.plt echo "load 'tmp.plt'" | gnuplot rm tmp.plt tmp.dat } runTimeVsN() { local data=$1 file=$Ourmine/data/results/step/graphs/$2 numAtts=`cat $data | grep -v \# | cut -d, -f 10 | sort -n | uniq` (for num in $numAtts; do m1=`cat $data | grep genic | gawk 'BEGIN{FS=","} index($10,n)!=0 {print $7}' n=$num - | median` m2=`cat $data | grep kmeans | gawk 'BEGIN{FS=","} index($10,n)!=0 {print $7}' n=$num - | median` m3=`cat $data | grep canopy | gawk 'BEGIN{FS=","} index($10,n)!=0 {print $7}' n=$num - | median` blabln "attrCount=$num, m1 = $m1, m2 = $m2, m3 = $m3" line=${num}X${m1}X${m2}X${m3} echo $line | sed 's/X/\t/g' | sed 's/n//g' | sed 's/_//g' done) | sort -n > tmp.dat echo "set terminal postscript" > tmp.plt # echo "set log x" >> tmp.plt # echo "set log y" >> tmp.plt echo "set autoscale x" >> tmp.plt 
echo "set autoscale y" >> tmp.plt echo "set xlabel \"N (number of attributes)\"" >> tmp.plt echo "set ylabel \"Run-time (seconds)\"" >> tmp.plt echo "set output '$file'" >> tmp.plt echo "set title '$3 --- Attribute Count vs. Run-time'" >> tmp.plt echo "plot 'tmp.dat' using 1:2 with linespoints lt 1 pt 8 t \"genic\"," \ "'tmp.dat' using 1:3 with linespoints lt 1 pt 4 t \"kmeans\"" \ ",'tmp.dat' using 1:4 with linespoints lt 1 pt 6 t \"canopy\"" >> tmp.plt echo "load 'tmp.plt'" | gnuplot rm tmp.plt tmp.dat } aggSimVsN() { f1=$Ourmine/data/results/step/step_multiN_full.csv f2=$Ourmine/data/results/step/kmeansDimReduce/pcaKmeans.csv f3=$Ourmine/data/results/step/kmeansDimReduce/icaKmeans.csv f4=$Ourmine/data/results/step/kmeansDimReduce/fmKmeans.csv file=$Ourmine/data/results/step/graphs/$1 numAtts=`cat $f1 | grep -v \# | cut -d, -f 10 | sort -n | uniq` (for num in $numAtts; do #external sim m1=`cat $f1 | grep genic | gawk 'BEGIN{FS=","} index($10,n)!=0 {print $8}' n=$num - | medianBounded 0 1` m2=`cat $f1 | grep kmeans | gawk 'BEGIN{FS=","} index($10,n)!=0 {print $8}' n=$num - | medianBounded 0 1` m3=`cat $f1 | grep canopy | gawk 'BEGIN{FS=","} index($10,n)!=0 {print $8}' n=$num - | medianBounded 0 1` #internal sim m4=`cat $f1 | grep genic | gawk 'BEGIN{FS=","} index($10,n)!=0 {print $9}' n=$num - | medianBounded 0 1` m5=`cat $f1 | grep kmeans | gawk 'BEGIN{FS=","} index($10,n)!=0 {print $9}' n=$num - | medianBounded 0 1` m6=`cat $f1 | grep canopy | gawk 'BEGIN{FS=","} index($10,n)!=0 {print $9}' n=$num - | medianBounded 0 1` blabln "attrCount=$num, m1 = $m1, m2 = $m2, m3 = $m3, m4 = $m4, m5 = $m5, m6 = $m6" line=${num}X${m1}X${m2}X${m3}X${m4}X${m5}X${m6} echo $line | sed 's/X/\t/g' | sed 's/n//g' | sed 's/_//g' done) | sort -n > full.dat aggSimVsN_helper $f2 > pca.dat aggSimVsN_helper $f3 > ica.dat aggSimVsN_helper $f4 > fm.dat echo "set terminal postscript" > tmp.plt # echo "set log x" >> tmp.plt # echo "set log y" >> tmp.plt echo "set autoscale x" >> tmp.plt echo 
"set autoscale y" >> tmp.plt echo "set xlabel \"N (number of attributes)\"" >> tmp.plt echo "set ylabel \"Similarity\"" >> tmp.plt echo "set output '$file'" >> tmp.plt echo "set title '$2 --- Attribute Count vs. Run-time'" >> tmp.plt # echo "plot 'full.dat' using 1:2 with linespoints lt 1 pt 8 t \"genic: external\"," \ # "'full.dat' using 1:5 with linespoints lt 2 pt 8 t \"internal\"," \ # " 'full.dat' using 1:3 with linespoints lt 1 pt 4 t \"kmeans: external\"," \ # " 'full.dat' using 1:6 with linespoints lt 2 pt 4 t \"internal\"," \ # " 'full.dat' using 1:4 with linespoints lt 1 pt 6 t \"canopy: external\"," \ # " 'full.dat' using 1:7 with linespoints lt 2 pt 6 t \"internal\"," \ echo " plot 'pca.dat' using 1:3 with linespoints lt 1 pt 21 t \"kmeans-PCA: external\"," \ " 'pca.dat' using 1:2 with linespoints lt 2 pt 21 t \"internal\"," \ " 'fm.dat' using 1:3 with linespoints lt 1 pt 16 t \"kmeans-FastMap: external\"," \ " 'fm.dat' using 1:2 with linespoints lt 2 pt 16 t \"internal\"" >> tmp.plt # " 'ica.dat' using 1:3 with linespoints lt 1 pt 19 t \"kmeans-ICA: external\"," \ # " 'ica.dat' using 1:2 with linespoints lt 2 pt 19 t \"internal\"" >> tmp.plt echo "load 'tmp.plt'" | gnuplot # rm tmp.plt tmp.dat } aggSimVsK() { f1=$Ourmine/data/results/step/step_multiN_full.csv f2=$Ourmine/data/results/step/kmeansDimReduce/pcaKmeans.csv f3=$Ourmine/data/results/step/kmeansDimReduce/icaKmeans.csv f4=$Ourmine/data/results/step/kmeansDimReduce/fmKmeans.csv file=$Ourmine/data/results/step/graphs/$1 numAtts=`cat $f1 | grep -v \# | cut -d, -f 5 | sort -n | uniq` (for num in $numAtts; do #external sim m1=`cat $f1 | grep genic | gawk 'BEGIN{FS=","} index($5,n)!=0 {print $8}' n=$num - | medianBounded 0 1` m2=`cat $f1 | grep kmeans | gawk 'BEGIN{FS=","} index($5,n)!=0 {print $8}' n=$num - | medianBounded 0 1` m3=`cat $f1 | grep canopy | gawk 'BEGIN{FS=","} index($5,n)!=0 {print $8}' n=$num - | medianBounded 0 1` #internal sim m4=`cat $f1 | grep genic | gawk 'BEGIN{FS=","} 
index($5,n)!=0 {print $9}' n=$num - | medianBounded 0 1` m5=`cat $f1 | grep kmeans | gawk 'BEGIN{FS=","} index($5,n)!=0 {print $9}' n=$num - | medianBounded 0 1` m6=`cat $f1 | grep canopy | gawk 'BEGIN{FS=","} index($5,n)!=0 {print $9}' n=$num - | medianBounded 0 1` blabln "attrCount=$num, m1 = $m1, m2 = $m2, m3 = $m3, m4 = $m4, m5 = $m5, m6 = $m6" line=${num}X${m1}X${m2}X${m3}X${m4}X${m5}X${m6} echo $line | sed 's/X/\t/g' | sed 's/k//g' | sed 's/_//g' done) | sort -n > full.dat aggSimVsK_helper $f2 > pca.dat aggSimVsK_helper $f3 > ica.dat aggSimVsK_helper $f4 > fm.dat echo "set terminal postscript" > tmp.plt # echo "set log x" >> tmp.plt # echo "set log y" >> tmp.plt echo "set autoscale x" >> tmp.plt echo "set autoscale y" >> tmp.plt echo "set xlabel \"N (number of attributes)\"" >> tmp.plt echo "set ylabel \"Similarity\"" >> tmp.plt echo "set output '$file'" >> tmp.plt echo "set title '$2 --- Attribute Count vs. Run-time'" >> tmp.plt echo "plot 'full.dat' using 1:2 with linespoints lt 1 pt 8 t \"genic: external\"," \ "'full.dat' using 1:5 with linespoints lt 2 pt 8 t \"internal\"," \ " 'full.dat' using 1:3 with linespoints lt 1 pt 4 t \"kmeans: external\"," \ " 'full.dat' using 1:6 with linespoints lt 2 pt 4 t \"internal\"," \ " 'full.dat' using 1:4 with linespoints lt 1 pt 6 t \"canopy: external\"," \ " 'full.dat' using 1:7 with linespoints lt 2 pt 6 t \"internal\"," \ " 'pca.dat' using 1:3 with linespoints lt 1 pt 21 t \"kmeans-PCA: external\"," \ " 'pca.dat' using 1:2 with linespoints lt 2 pt 21 t \"internal\"," \ " 'fm.dat' using 1:3 with linespoints lt 1 pt 16 t \"kmeans-FastMap: external\"," \ " 'fm.dat' using 1:2 with linespoints lt 2 pt 16 t \"internal\"" >> tmp.plt # " 'ica.dat' using 1:3 with linespoints lt 1 pt 19 t \"kmeans-ICA: external\"," \ # " 'ica.dat' using 1:2 with linespoints lt 2 pt 19 t \"internal\"" >> tmp.plt echo "load 'tmp.plt'" | gnuplot # rm tmp.plt tmp.dat } aggSimVsN_helper(){ numAtts=`cat $1 | grep -v \# | cut -d, -f 3 | sort -n | 
uniq` (for num in $numAtts; do #internal m1=`cat $1 | gawk 'BEGIN{FS=","} $3==n {print $7 >= 0 ? $7 : -$7}' n=$num - | medianBounded 0 1` #external m2=`cat $1 | gawk 'BEGIN{FS=","} $3==n {print $6 >= 0 ? $6 : -$6}' n=$num - | medianBounded 0 1` blabln "attrCount=$num, m1 = $m1, m2=$m2" line=${num}X${m1}X${m2} echo $line | sed 's/X/\t/g' | sed 's/n//g' | sed 's/_//g' done) | sort -n } aggSimVsK_helper(){ numAtts=`cat $1 | grep -v \# | cut -d, -f 2 | sort -n | uniq` (for num in $numAtts; do #internal m1=`cat $1 | gawk 'BEGIN{FS=","} $2==n {print $7 >= 0 ? $7 : -$7}' n=$num - | medianBounded 0 1` #external m2=`cat $1 | gawk 'BEGIN{FS=","} $2==n {print $6 >= 0 ? $6 : -$6}' n=$num - | medianBounded 0 1` blabln "attrCount=$num, m1 = $m1, m2=$m2" line=${num}X${m1}X${m2} echo $line | sed 's/X/\t/g' | sed 's/n//g' | sed 's/_//g' done) | sort -n } aggRunTimeVsN() { f1=$Ourmine/data/results/step/step_multiN_full.csv f2=$Ourmine/data/results/step/kmeansDimReduce/pcaKmeans.csv f3=$Ourmine/data/results/step/kmeansDimReduce/icaKmeans.csv f4=$Ourmine/data/results/step/kmeansDimReduce/fmKmeans.csv file=$Ourmine/data/results/step/graphs/$1 numAtts=`cat $f1 | grep -v \# | cut -d, -f 10 | sort -n | uniq` (for num in $numAtts; do m1=`cat $f1 | grep genic | gawk 'BEGIN{FS=","} index($10,n)!=0 {print $7}' n=$num - | median` m2=`cat $f1 | grep kmeans | gawk 'BEGIN{FS=","} index($10,n)!=0 {print $7}' n=$num - | median` m3=`cat $f1 | grep canopy | gawk 'BEGIN{FS=","} index($10,n)!=0 {print $7}' n=$num - | median` blabln "attrCount=$num, m1 = $m1, m2 = $m2, m3 = $m3" line=${num}X${m1}X${m2}X${m3} echo $line | sed 's/X/\t/g' | sed 's/n//g' | sed 's/_//g' done) | sort -n > full.dat aggRunTimeVsN_helper $f2 > pca.dat aggRunTimeVsN_helper $f3 > ica.dat aggRunTimeVsN_helper $f4 > fm.dat echo "set terminal postscript" > tmp.plt # echo "set log x" >> tmp.plt # echo "set log y" >> tmp.plt echo "set autoscale x" >> tmp.plt echo "set autoscale y" >> tmp.plt echo "set xlabel \"N (number of 
attributes)\"" >> tmp.plt echo "set ylabel \"Run-time (seconds)\"" >> tmp.plt echo "set output '$file'" >> tmp.plt echo "set title '$2 --- Attribute Count vs. Run-time'" >> tmp.plt echo "plot 'full.dat' using 1:2 with linespoints lt 1 pt 8 t \"genic\"," \ "'full.dat' using 1:3 with linespoints lt 1 pt 4 t \"kmeans\"," \ "'full.dat' using 1:4 with linespoints lt 1 pt 6 t \"canopy\"," \ "'pca.dat' using 1:2 with linespoints lt 1 pt 21 t \"kmeans - PCA\"," \ "'fm.dat' using 1:2 with linespoints lt 1 pt 16 t \"kmeans - FastMap\"" >> tmp.plt # "'ica.dat' using 1:2 with linespoints lt 1 pt 19 t \"kmeans - ICA\"," \ echo "load 'tmp.plt'" | gnuplot # rm tmp.plt tmp.dat } aggRunTimeVsN_helper(){ numAtts=`cat $1 | grep -v \# | cut -d, -f 3 | sort -n | uniq` (for num in $numAtts; do m2=`cat $1 | gawk 'BEGIN{FS=","} $3==n {print $5 >= 0 ? $5 : -$5}' n=$num - | median` blabln "attrCount=$num, m1 = $m1" line=${num}X${m2} echo $line | sed 's/X/\t/g' | sed 's/n//g' | sed 's/_//g' done) | sort -n } simVsK() { local data=$1 file=$Ourmine/data/results/step/graphs/$2 sizes=`cat $data | grep -v \# | cut -d, -f 5 | sort -n | uniq` (for size in $sizes; do m1=`cat $data | grep genic | gawk 'BEGIN{FS=","} index($5,k)!=0 {print $8}' k=$size - | medianBounded 0 1` m2=`cat $data | grep kmeans | gawk 'BEGIN{FS=","} index($5,k)!=0 {print $8}' k=$size - | medianBounded 0 1` m3=`cat $data | grep canopy | gawk 'BEGIN{FS=","} index($5,k)!=0 {print $8}' k=$size - | medianBounded 0 1` m4=`cat $data | grep genic | gawk 'BEGIN{FS=","} index($5,k)!=0 {print $9}' k=$size - | medianBounded 0 1` m5=`cat $data | grep kmeans | gawk 'BEGIN{FS=","} index($5,k)!=0 {print $9}' k=$size - | medianBounded 0 1` m6=`cat $data | grep canopy | gawk 'BEGIN{FS=","} index($5,k)!=0 {print $9}' k=$size - | medianBounded 0 1` line=${size}X${m1}X${m2}X${m3}X${m4}X${m5}X${m6} echo $line | sed 's/X/\t/g' | sed 's/k//' done) | sort -n > tmp.dat echo "set terminal postscript " > tmp.plt echo "set autoscale x" >> tmp.plt echo "set 
yrange [0.01:1]" >> tmp.plt echo "set logscale y" >> tmp.plt echo "set xlabel \"k (number of clusters)\"" >> tmp.plt echo "set ylabel \"Similarlity\"" >> tmp.plt echo "set output '$file'" >> tmp.plt echo "set title '$3 - Similarity Vs. K'" >> tmp.plt echo "plot 'tmp.dat' using 1:2 with linespoints lt 1 pt 8 t \"genic: external\"," \ "'tmp.dat' using 1:5 with linespoints lt 2 pt 8 t \"internal \n\"," \ " 'tmp.dat' using 1:3 with linespoints lt 1 pt 4 t \"kmeans: external\"," \ " 'tmp.dat' using 1:6 with linespoints lt 2 pt 4 t \"internal\"," \ " 'tmp.dat' using 1:4 with linespoints lt 1 pt 6 t \"canopy: external\"," \ " 'tmp.dat' using 1:7 with linespoints lt 2 pt 6 t \"internal\"" >> tmp.plt echo "load 'tmp.plt'" | gnuplot rm tmp.dat tmp.plt } kVsNVsSim() { local data=$1 file="kVsNvsExternalSim"$3 ks=`cat $data | grep -v \# | cut -d, -f 5 | sort -n | uniq` ns=`cat $data | grep -v \# | cut -d, -f 10 | sort -n | uniq` (for k in $ks; do for n in $ns; do m1=`cat $data | grep $n | grep $k | grep genic | cut -d, -f9 | medianBounded 0 1` m2=`cat $data | grep $n | grep $k | grep kmeans | cut -d, -f9 | medianBounded 0 1` m3=`cat $data | grep $n | grep $k | grep canopy | cut -d, -f9 | medianBounded 0 1` m4=`cat $data | grep $n | grep $k | grep genic | cut -d, -f8 | medianBounded 0 1` m5=`cat $data | grep $n | grep $k | grep kmeans | cut -d, -f8 | medianBounded 0 1` m6=`cat $data | grep $n | grep $k | grep canopy | cut -d, -f8 | medianBounded 0 1` line=${k}X${n}X${m1}X${m2}X${m3}X${m4}X${m5}X${m6} echo $line | sed 's/X/\t/g' | sed 's/\(n\|_\)//g' | sed 's/k//g' done done) | sort -n > tmp.dat echo "set terminal postscript " > tmp.plt # echo "set autoscale x" >> tmp.plt # echo "set autoscale y" >> tmp.plt # echo "set autoscale z" >> tmp.plt # echo "set zrange [0.01:1]" >> tmp.plt # echo "set logscale y" >> tmp.plt echo "set xlabel \"N (number of attributes)\"" >> tmp.plt echo "set ylabel \"K (number of clusters)\"" >> tmp.plt echo "set zlabel \"Similarlity\"" >> tmp.plt echo 
"set dgrid3d 30,30" >> tmp.plt # echo "set hidden3d" >> tmp.plt cp tmp.plt tmp_g.plt cp tmp.plt tmp_k.plt cp tmp.plt tmp_c.plt echo "set title '$2 - K vs N vs Similarity - genic'" >> tmp_g.plt f1=$file"_genic.ps" echo "set output '$f1'" >> tmp_g.plt echo "splot 'tmp.dat' u 1:2:3 with lines t \"genic: external\"," \ "'tmp.dat' u 1:2:6 with lines lt 2 t \"internal\"" >> tmp_g.plt echo "load 'tmp_g.plt'" | gnuplot echo "set title '$2 - K vs N vs Similarity - kmeans'" >> tmp_k.plt f2=$file"_kmeans.ps" echo "set output '$f2'" >> tmp_k.plt echo "splot 'tmp.dat' u 1:2:4 with lines t \"kmeans: external\"," \ "'tmp.dat' u 1:2:7 with lines lt 2 t \"internal\"" >> tmp_k.plt echo "load 'tmp_k.plt'" | gnuplot echo "set title '$2 - K vs N vs Similarity - canopy'" >> tmp_c.plt f3=$file"_canopy.ps" echo "set output '$f3'" >> tmp_c.plt echo "splot 'tmp.dat' u 1:2:5 with lines t \"canopy: external\"," \ "'tmp.dat' u 1:2:8 with lines lt 2 t \"internal\"" >> tmp_c.plt echo "load 'tmp_c.plt'" | gnuplot # rm tmp.dat tmp.plt tmp_g.plt tmp_k.plt tmp_c.plt } simVsN() { local data=$1 file=$Ourmine/data/results/step/graphs/$2 sizes=`cat $data | grep -v \# | cut -d, -f 10 | sort -n | uniq` (for size in $sizes; do m1=`cat $data | grep genic | gawk 'BEGIN{FS=","} index($10,n)!=0 {print $8}' n=$size - | medianBounded 0 1` m2=`cat $data | grep kmeans | gawk 'BEGIN{FS=","} index($10,n)!=0 {print $8}' n=$size - | medianBounded 0 1` m3=`cat $data | grep canopy | gawk 'BEGIN{FS=","} index($10,n)!=0 {print $8}' n=$size - | medianBounded 0 1` m4=`cat $data | grep genic | gawk 'BEGIN{FS=","} index($10,n)!=0 {print $9}' n=$size - | medianBounded 0 1` m5=`cat $data | grep kmeans | gawk 'BEGIN{FS=","} index($10,n)!=0 {print $9}' n=$size - | medianBounded 0 1` m6=`cat $data | grep canopy | gawk 'BEGIN{FS=","} index($10,n)!=0 {print $9}' n=$size - | medianBounded 0 1` line=${size}X${m1}X${m2}X${m3}X${m4}X${m5}X${m6} echo $line | sed 's/X/\t/g' | sed 's/\(n\|_\)//g' done) | sort -n > tmp.dat echo "set 
terminal postscript " > tmp.plt echo "set autoscale x" >> tmp.plt echo "set yrange [0.01:1]" >> tmp.plt echo "set logscale y" >> tmp.plt echo "set xlabel \"N (number of attributes)\"" >> tmp.plt echo "set ylabel \"Similarlity\"" >> tmp.plt echo "set output '$file'" >> tmp.plt echo "set title '$3 - Similarity Vs. N'" >> tmp.plt echo "plot 'tmp.dat' using 1:2 with linespoints lt 1 pt 8 t \"genic: external\"," \ "'tmp.dat' using 1:5 with linespoints lt 2 pt 8 t \"internal \n\"," \ " 'tmp.dat' using 1:3 with linespoints lt 1 pt 4 t \"kmeans: external\"," \ " 'tmp.dat' using 1:6 with linespoints lt 2 pt 4 t \"internal\"," \ " 'tmp.dat' using 1:4 with linespoints lt 1 pt 6 t \"canopy: external\"," \ " 'tmp.dat' using 1:7 with linespoints lt 2 pt 6 t \"internal\"" >> tmp.plt echo "load 'tmp.plt'" | gnuplot rm tmp.dat tmp.plt } speedPlotsPerDS() { local data=$1 local dataSets=`cat $data | grep -v \# | cut -d, -f1 | sort | uniq` for ds in $dataSets; do file=$Ourmine/data/results/step/graphs/'overall_runTime.png' sizes=`cat $data | cut -d, -f 5 | sort -n | uniq` for size in $sizes; do m1=`cat $data | grep genic | gawk 'BEGIN{FS=","} $5==k {print $7}' k=$size - | mean` m2=`cat $data | grep kmeans | gawk 'BEGIN{FS=","} $5==k {print $7}' k=$size - | mean` m3=`cat $data | grep canopy | gawk 'BEGIN{FS=","} $5==k {print $7}' k=$size - | mean` echo "size= $size, m1 = $m1, m2 = $m2, m3 = $m3" line=${size}X${m1}X${m2}X${m3} echo $line | sed 's/X/\t/' | sed 's/X/\t/' | sed 's/X/\t/' | sed 's/X/\t/' >> tmp.dat done echo "set terminal png " > tmp.plt echo "set log x" >> tmp.plt echo "set output '$file'" >> tmp.plt echo "plot 'tmp.dat' using 1:2 with lines lt 3 t \"genic\", 'tmp.dat' using 1:3 with lines lt 2 t \"kmeans\", 'tmp.dat' using 1:4 with lines lt 5 t \"canopy\"" >> tmp.plt echo "set title '$ds --- Run-time Comparison'" >> tmp.plt echo "load 'tmp.plt'" | gnuplot rm tmp.dat tmp.plt mv $file $Ourmine/data/runTimeGraphs/ done } kVsNVsPureEnt() { local data=$1 
file="kVsNvsEntropyPurity" ks=`cat $data | grep -v \# | cut -d, -f 3 | sed 's/k//g' | sort -n | uniq` ns=`cat $data | grep -v \# | cut -d, -f 4 | sort -n | uniq` (for k in $ks; do for n in $ns; do ent=`cat $data | grep $n | grep "k"$k | grep genic | cut -d, -f6 | median` pur=`cat $data | grep $n | grep "k"$k | grep kmeans | cut -d, -f7 | median` line=${k}X${n}X${ent}X${pur}X echo $line | sed 's/X/\t/g' | sed 's/dim//g' | sed 's/k//g' done done) | sort -n > tmp.dat echo "set terminal postscript " > tmp.plt # echo "set autoscale x" >> tmp.plt # echo "set autoscale y" >> tmp.plt # echo "set autoscale z" >> tmp.plt # echo "set zrange [0.01:1]" >> tmp.plt # echo "set logscale y" >> tmp.plt echo "set xlabel \"N (number of attributes)\"" >> tmp.plt echo "set ylabel \"K (number of clusters)\"" >> tmp.plt echo "set zlabel \"Purity/Entropy\"" >> tmp.plt echo "set dgrid3d 30,30" >> tmp.plt # echo "set hidden3d" >> tmp.plt echo "set title '$2 - K vs N vs Purity'" >> tmp.plt f1=$file".ps" echo "set output '$f1'" >> tmp.plt echo "splot 'tmp.dat' u 1:2:3 with lines t \"entropy\"," \ "'tmp.dat' u 1:2:4 with lines lt 2 t \"purity\"" >> tmp.plt echo "load 'tmp.plt'" | gnuplot # rm tmp.dat tmp.plt tmp_g.plt tmp_k.plt tmp_c.plt } naturalDatasetSimilarityGraph() { local file=$Safe/orig_sim_clean.csv local meths="tfidf pca fastmap" ns=`cat $file | grep -v \# | cut -d, -f 3 | sort -n | uniq` datas=`cat $file | grep -v \# | cut -d, -f 1 | sort -n | uniq` cd plot_data for data in $datas; do (for n in $ns; do line=${n} for m in $meths; do eSim=`cat $file | grep $n | grep $m | grep $data | cut -d, -f5 | median` iSim=`cat $file | grep $n | grep $m | cut -d, -f4 | median` line=${line}X${eSim}X${iSim} done echo $line | sed 's/X/\t/g' | sed 's/dim//g' | sed 's/k//g' done) | sort -n > $data.tmp.dat local output=$data"_natural_sim" echo "set terminal postscript " > tmp.plt echo "set xrange [3:10000]" >> tmp.plt echo "set xtics 0,5" >> tmp.plt echo "set logscale x" >> tmp.plt echo "set xlabel \"N 
(number of attributes)\"" >> tmp.plt echo "set ylabel \"Internal & External Similarity\"" >> tmp.plt echo "set title 'Similarity of natural clusterings - $data'" >> tmp.plt echo "set output '$output.eps'" >> tmp.plt echo "plot '$data.tmp.dat' u 1:2 with linespoints lt -1 pt 6 t \"tfidf - external_sim\"," \ "'$data.tmp.dat' u 1:3 with linespoints lt -1 pt 4 t \"tfidf - internal_sim\"," \ "'$data.tmp.dat' u 1:4 with linespoints lt 0 pt 6 t \"pca - external_sim\"," \ "'$data.tmp.dat' u 1:5 with linespoints lt 0 pt 4 t \"pca - internal_sim\"," \ "'$data.tmp.dat' u 1:6 with linespoints lt 1 pt 6 t \"fastmap - external_sim\"," \ "'$data.tmp.dat' u 1:7 with linespoints lt 1 pt 4 t \"fastmap - internal_sim\"" >> tmp.plt echo "load 'tmp.plt'" | gnuplot # rm tmp.dat tmp.plt tmp_g.plt tmp_k.plt tmp_c.plt done } superClusterSplot() { local data=$1 local title=$2 local Xfield=$3 local Xlabel=$4 local Yfield=$5 local Ylabel=$6 local Zlabel=$7 local count=7 local numZs=0 shift $count; while [ $# -gt 0 ]; do Zlabels[$numZs]=$1; Zfields[$numZs]=$2; shift 2 numZs=$(( $numZs + 1 )) done xs=`cat $data | grep -v \# | cut -d, -f $Xfield | sort -n | uniq` ys=`cat $data | grep -v \# | cut -d, -f $Yfield | sort -n | uniq` line="" ( for x in $xs; do for y in $ys; do line=${x}X${y} for i in `seq 0 $(($numZs - 1 ))`; do med=`cat $data | grep $x | grep $y | cut -d, -f ${Zfields[$i]} | median` line=${line}X${med} done echo $line | sed 's/X/\t/g' | sed 's/k//g' | sed 's/d//g' done done ) | sort -n > tmp.dat echo "set terminal postscript eps enhanced color " > tmp.plt # echo "set autoscale x" >> tmp.plt # echo "set autoscale y" >> tmp.plt # echo "set autoscale z" >> tmp.plt # echo "set zrange [0.01:1]" >> tmp.plt # echo "set logscale y" >> tmp.plt echo "set xlabel \"$Xlabel\"" >> tmp.plt echo "set ylabel \"$Ylabel\"" >> tmp.plt echo "set zlabel \"$Zlabel\"" >> tmp.plt echo "set surface" >> tmp.plt echo "set dgrid3d 30,30" >> tmp.plt echo "set title '$title'" >> tmp.plt echo -n "splot 'tmp.dat' u 
1:2:3 with lines lt 2 t \"${Zlabels[0]}\"" >> tmp.plt for i in `seq 1 $(( $numZs -1 ))`; do echo -n ", 'tmp.dat' u 1:2:$(($i + 3)) with lines lc $i lt $(($i+2)) t \"${Zlabels[$i]}\"" >> tmp.plt done echo "load 'tmp.plt'" | gnuplot } superClusterPlot() { local data=$1 local title=$2 local Xfield=$3 local Xlabel=$4 local Ylabel=$5 local numZs=0 shift 5; while [ $# -gt 0 ]; do Ylabels[$numZs]=$1; Yfields[$numZs]=$2; shift 2 numZs=$(( $numZs + 1 )) done xs=`cat $data | grep -v \# | cut -d, -f $Xfield | sort -n | uniq` line="" ( for x in $xs; do line=${x} for i in `seq 0 $(($numZs - 1 ))`; do med=`cat $data | grep $x | cut -d, -f ${Yfields[$i]} | median` line=${line}X${med} done echo $line | sed 's/X/\t/g' | sed 's/k//g' | sed 's/d//g' done ) | sort -n > tmp.dat echo "set terminal postscript " > tmp.plt # echo "set autoscale x" >> tmp.plt # echo "set autoscale y" >> tmp.plt # echo "set autoscale z" >> tmp.plt # echo "set zrange [0.01:1]" >> tmp.plt # echo "set logscale y" >> tmp.plt echo "set xlabel \"$Xlabel\"" >> tmp.plt echo "set ylabel \"$Ylabel\"" >> tmp.plt echo "set title '$title'" >> tmp.plt echo -n "plot 'tmp.dat' u 1:2 with lines lt 2 t \"${Ylabels[0]}\"" >> tmp.plt for i in `seq 1 $(( $numZs -1 ))`; do echo -n ", 'tmp.dat' u 1:$(($i + 2)) with lines lt $(($i+2)) t \"${Ylabels[$i]}\"" >> tmp.plt done echo "load 'tmp.plt'" | gnuplot } plot2D(){ buildGnuplotDataFromArff $1 2 > tmp.dat echo "set terminal png " > tmp.plt echo "set output '$3'" >> tmp.plt echo "set title '$1 - $2'" >> tmp.plt echo "plot 'tmp.dat' with points lt 1 pt 12 t \"\"" >> tmp.plt; # echo "load 'save.plt'" >> tmp.plt; echo "load 'tmp.plt" | gnuplot } buildGnuPlotDataFromArff() { gawk 'BEGIN{OFS="\t"} data {line=""; for(i=1; i<=dims) {line=line$i"\t"} print line } /@data/ {data=1}' dims=$2 $1 | sed 's/,/\t/g' } buildGnuPlotDataFromSparff() { gawk 'BEGIN{OFS="\t"} data { s=$0; gsub(/{/, "", s); gsub(/}/, "", s); split(s, ary, /,/); for(d in ary) { split(ary[d], d2, / /) line[d2[1]] = d2[2] } 
lineStr="" for(i = 0; i tmp.dat; } plot3D(){ buildGnuplotDataFromArff $1 3 > tmp.dat echo "set terminal png " > tmp.plt echo "set output '$3'" >> tmp.plt echo "set title '$1 - $2'" >> tmp.plt echo "splot 'tmp.dat' with points lt 1 pt 12 t \"\"" >> tmp.plt; # echo "load 'save.plt'" >> tmp.plt; echo "load 'tmp.plt" | gnuplot } plot2D_sparff(){ buildGnuplotDataFromSparff $1 2 > tmp.dat echo "set terminal png " > tmp.plt; echo "set output '$3'" >> tmp.plt; echo "set title '$2'" >> tmp.plt; echo "plot 'tmp.dat' with points lt 1 pt 12 t \"\"" >> tmp.plt; echo "load 'tmp.plt" | gnuplot echo "set terminal png " > tmp.plt echo "set output '$3'" >> tmp.plt echo "set title '$1 - $2'" >> tmp.plt echo "plot 'tmp.dat' with points lt 1 pt 12 t \"\"" >> tmp.plt; # echo "load 'save.plt'" >> tmp.plt; echo "load 'tmp.plt" | gnuplot } plot3D_sparff(){ buildGnuplotDataFromSparff $1 3 > tmp.dat echo "set terminal png " > tmp.plt echo "set output '$3'" >> tmp.plt echo "set title '$1 - $2'" >> tmp.plt echo "splot 'tmp.dat' with points lt 1 pt 12 t \"\"" >> tmp.plt; # echo "load 'save.plt'" >> tmp.plt; echo "load 'tmp.plt" | gnuplot } buildColoredClusterPlots() { local datas="ap203 ap214" local reducers="tfidf pca fastmap" local baseFile=~/Dropbox/data/step.csv for d in $datas; do for r in $reducers; do cat $baseFile | grep $d | grep $r ################################# # NOT DONE YET ################################# done done } buildGraphsForTim_reduction(){ local datasets="ap203 ap214 [0-9a-zA-Z]" mkdir graphs_for_tim &> /dev/null cd graphs_for_tim #buildDataFileForPlot XfieldNo YfieldNo [YfieldNo2 YFieldNo3 ... 
local baseFile=$Ourmine/data/results/step/stepDimReduce/step_dimReduce_runTimes.csv for d in $datasets; do mkdir $d &> /dev/null cd $d cat $baseFile | grep $d > $d.csv cat $d.csv | grep pca > pca.csv cat $d.csv | grep tfidf > tfidf.csv cat $d.csv | grep fastmap > fastmap.csv buildDataFileForPlot pca.csv 3 4 | uniq > pcaN.dat buildDataFileForPlot tfidf.csv 3 4 | uniq > tfidfN.dat buildDataFileForPlot fastmap.csv 3 4 | uniq > fastmapN.dat echo "set terminal postscript eps 'Helvetica' 20" > buildNplot.plt echo "set size 0.5, 0.5" >> buildNplot.plt echo "set xtics 0,20" >> buildNplot.plt echo "set logscale y" >> buildNplot.plt echo "set key bottom right" >> buildNplot.plt echo "set xlabel \"N (number of features)\"" >> buildNplot.plt echo "set ylabel \"Runtime (seconds)\"" >> buildNplot.plt echo "set title '$d - Dimension Reduction - Number of Dimensions vs Runtimes'" >> buildNplot.plt echo "set output 'step_reduction_runtimes_N.eps'" >> buildNplot.plt echo -n "plot 'pcaN.dat' u 1:2 with linesp 0 0 t \"PCA\"" >> buildNplot.plt echo -n ", 'tfidfN.dat' u 1:2 with linesp 1 0 t \"TFIDF\"" >> buildNplot.plt echo -n ", 'fastmapN.dat' u 1:2 with linesp 1 1 t \"Fastmap\"" >> buildNplot.plt echo "load 'buildNplot.plt'" | gnuplot ps2pdf step_reduction_runtimes_N.eps rm $d.csv cd .. done } buildGraphsForTim_clustering(){ local datasets="ap214" mkdir graphs_for_tim_clust &> /dev/null cd graphs_for_tim_clust #buildDataFileForPlot XfieldNo YfieldNo [YfieldNo2 YFieldNo3 ... 
local baseFile=~/Dropbox/data/step.csv for d in $datasets; do mkdir $d &> /dev/null cd $d cat $baseFile | grep $d > $d.csv cat $d.csv | grep genic > genic.csv cat $d.csv | grep kmeans > kmeans.csv cat $d.csv | grep canopy > canopy.csv buildDataFileForPlot genic.csv 5 8 | uniq > genicN.dat buildDataFileForPlot kmeans.csv 5 8 | uniq > kmeansN.dat buildDataFileForPlot canopy.csv 5 8 | uniq > canopyN.dat echo "set terminal postscript eps 'Helvetica' 20" > buildNplot_clust.plt echo "set size 1, 1" >> buildNplot_clust.plt echo "set xtics 0,20" >> buildNplot_clust.plt echo "set logscale y" >> buildNplot_clust.plt echo "set xlabel \"N (number of features)\"" >> buildNplot_clust.plt echo "set ylabel \"Runtime (seconds)\"" >> buildNplot_clust.plt echo "set title '$d - Clustering - Number of Dimensions vs Runtimes'" >> buildNplot_clust.plt echo "set output 'step_cluster_runtimes_N.eps'" >> buildNplot_clust.plt echo -n "plot 'genicN.dat' u 1:2 with lines lt -1 t \"GenIc\"" >> buildNplot_clust.plt echo -n ", 'kmeansN.dat' u 1:2 with lines lt 2 t \"K-Means\"" >> buildNplot_clust.plt echo -n ", 'canopyN.dat' u 1:2 with lines lt 1 t \"Canopy\"" >> buildNplot_clust.plt echo "load 'buildNplot_clust.plt'" | gnuplot ps2pdf step_cluster_runtimes_N.eps buildDataFileForPlot genic.csv 6 8 | uniq > genicK.dat buildDataFileForPlot kmeans.csv 6 8 | uniq > kmeansK.dat buildDataFileForPlot canopy.csv 6 8 | uniq > canopyK.dat echo "set terminal postscript eps 'Helvetica' 20" > buildKplot_clust.plt echo "set size 1, 1" >> buildKplot_clust.plt echo "set logscale y" >> buildKplot_clust.plt echo "set xlabel \"K (number of clusters)\"" >> buildKplot_clust.plt echo "set ylabel \"Runtime (seconds)\"" >> buildKplot_clust.plt echo "set title '$d - Clustering - Number of Clusters vs Runtimes'" >> buildKplot_clust.plt echo "set output 'step_cluster_runtimes_K.eps'" >> buildKplot_clust.plt echo -n "plot 'genicK.dat' u 1:2 with lines lt -1 t \"GenIc\"" >> buildKplot_clust.plt echo -n ", 'kmeansK.dat' u 1:2 
with lines lt 2 t \"K-Means\"" >> buildKplot_clust.plt
    echo -n ", 'canopyK.dat' u 1:2 with lines lt 1 t \"Canopy\"" >> buildKplot_clust.plt
    echo "load 'buildKplot_clust.plt'" | gnuplot
    ps2pdf step_cluster_runtimes_K.eps
    cd ..
  done
}

#######################################
# Render one 3-D surface plot (gnuplot splot) per dataset and validity
# metric, overlaying one surface per clusterer/reduction-method pair.
# Reads:    ~/Dropbox/data/<dataset>.csv_new.csv
# Writes:   inside plot_data: <dataset>_<metric>_3d.eps, the generated
#           .plt script, per-pair .1/.2 data files, tmp.file, splotfile
# Requires: buildDataFileForSplot, gawk, gnuplot
# Returns:  1 if plot_data cannot be entered
# NOTE(review): the function changes into plot_data and does not change
# back (unchanged from the original) -- confirm callers expect that.
#######################################
buildOverlappingSplots() {
  local datas="bbcsports"
  local reductionMethods="pca tfidf fastmap"
  local clusterers="kmeans canopy genic"
  local metrics=( InternalSimilarity ExternalSimilarity Purity )
  # CSV column passed as the Z field for each metric; index-aligned with metrics[].
  local fields=( 10 9 12 )
  local data i metric field base name j c r

  mkdir -p plot_data
  cd plot_data || return 1

  for data in $datas; do
    for i in 0 1 2; do
      metric=${metrics[$i]}
      field=${fields[$i]}
      base="${data}_${metric}_3d.tmp"
      name="${data}_${metric}_3d.eps"

      # gnuplot preamble. The title string is now properly closed with a
      # single quote (the original left it unterminated).
      {
        echo "set terminal postscript eps enhanced color"
        # echo "set logscale x"
        # echo "set xrange [3:10000]"
        echo "set xlabel \"N (number of features)\""
        echo "set ylabel \"K (number of clusters)\""
        echo "set zlabel \"$metric\""
        echo "set title 'Trade offs of Dimensionality Vs Number of Clusters Vs. $metric'"
        echo "set output \"$name\""
        echo "set surface"
        echo "set dgrid3d 30,30"
        echo "set hidden3d"
        echo "set isosample 40"
      } > "$base.plt"

      # One splot clause per clusterer/reducer pair; j is the line colour.
      j=0
      : > tmp.file
      for c in $clusterers; do
        for r in $reductionMethods; do
          echo "$metric - $c - $r "
          grep "$c" "$HOME/Dropbox/data/$data.csv_new.csv" | grep "$r" > "$base.$c.$r.1"
          buildDataFileForSplot "$base.$c.$r.1" 5 6 "$field" > "$base.$c.$r.2"
          echo "'$base.$c.$r.2' u 1:2:3 with lines lc $j t \"$c-$r\"" >> tmp.file
          j=$(( j + 1 ))
        done
      done

      # Join the clauses into a single "splot a , b , c" command line.
      gawk 'BEGIN { ORS = " " }
            NR == 1 { print "splot " $0; next }
                    { print ", " $0 }' tmp.file > splotfile
      cat splotfile >> "$base.plt"
      echo "load '$base.plt'" | gnuplot
      echo "$metric done"
    done
  done
}

#######################################
# Produce the similarity and run-time graphs for the three STEP result
# files (full, ap203, ap214) under $Ourmine/data/results/step.
# For each file, calls simVsK, simVsN, runTimeVsK and runTimeVsN with
# (csv, output image name, title).
# Globals: Ourmine (read)
#######################################
buildGraphs() {
  local results="$Ourmine/data/results/step"
  local tags=( full ap203 ap214 )
  # Suffix appended to the image name; the "full" run has none.
  local suffixes=( "" _ap203 _ap214 )
  local titles=( "STEP Datasets" "AP 203" "AP 214" )
  local i plotter

  for i in 0 1 2; do
    for plotter in simVsK simVsN runTimeVsK runTimeVsN; do
      "$plotter" "$results/step_multiN_${tags[$i]}.csv" \
        "${plotter}${suffixes[$i]}.png" "${titles[$i]}"
    done
  done
}

#######################################
# Write a minimal LaTeX document that includes, centred, every file in
# the current directory whose name matches a pattern.
# Arguments: $1 - grep regular expression applied to the `ls` listing
#            $2 - path of the .tex file to write
# File names are read line by line, so names containing spaces stay
# intact; printf is used so backslashes are always emitted literally.
#######################################
makeLatex() {
  local pattern=$1
  local out=$2
  local file

  {
    printf '%s\n' \
      '\documentclass[12pt]{article}' \
      '\usepackage{graphicx}' \
      '\begin{document}' \
      '\begin{center}'
    ls | grep -- "$pattern" | while IFS= read -r file; do
      printf '\\includegraphics{%s}\n' "$file"
    done
    printf '%s\n' '\end{center}' '\end{document}'
  } > "$out"
}

#######################################
# buildDataFileForPlot dataFile XfieldNo YfieldNo [YfieldNo2 YfieldNo3 ...]
#
# For every distinct value of CSV column XfieldNo (lines containing '#'
# are ignored), print one tab-separated row: the X value followed by
# `median` of each requested Y column over the rows having that X value.
# All 'k' and 'd' characters are stripped from the output and the rows
# are sorted numerically.
#
# Rows are selected by an exact (whitespace-trimmed) match on the X
# column. The original used `grep $x` on the whole row, so a key such as
# "k5" also matched "k50" rows or hits in unrelated columns.
# Requires: median (external helper), gawk
#######################################
buildDataFileForPlot() {
  local data=$1
  local Xfield=$2
  shift 2
  local Yfields=( "$@" )
  local xs x yf med line

  xs=$(grep -v '#' "$data" | cut -d, -f "$Xfield" | sort -n | uniq)

  for x in $xs; do
    line=$x
    for yf in "${Yfields[@]}"; do
      med=$(gawk -F, -v xc="$Xfield" -v xv="$x" -v yc="$yf" '
              /#/ { next }
              {
                f = $xc
                gsub(/^[ \t]+|[ \t]+$/, "", f)
                # Concatenating "" forces a string comparison.
                if ((f "") == (xv "")) print $yc
              }' "$data" | median)
      line+=$'\t'$med
    done
    printf '%s\n' "$line" | tr -d 'kd'
  done | sort -n
}

#######################################
# buildDataFileForSplot dataFile XfieldNo YfieldNo ZfieldNo
#
# For every combination of distinct X and Y column values (lines
# containing '#' are ignored), print "x<TAB>y<TAB>median(Z)" where the
# median is taken over rows whose X and Y columns equal that pair.
# All 'k' and 'd' characters are stripped from the output and the rows
# are sorted numerically.
#
# Rows are selected by exact (whitespace-trimmed) column matches rather
# than the original substring `grep $x | grep $y` over the whole row.
# Requires: median (external helper), gawk
#######################################
buildDataFileForSplot() {
  local data=$1
  local Xfield=$2
  local Yfield=$3
  local Zfield=$4
  local xs ys x y med

  xs=$(grep -v '#' "$data" | cut -d, -f "$Xfield" | sort -n | uniq)
  ys=$(grep -v '#' "$data" | cut -d, -f "$Yfield" | sort -n | uniq)

  for x in $xs; do
    for y in $ys; do
      med=$(gawk -F, -v xc="$Xfield" -v yc="$Yfield" -v zc="$Zfield" \
                 -v xv="$x" -v yv="$y" '
              /#/ { next }
              {
                fx = $xc; gsub(/^[ \t]+|[ \t]+$/, "", fx)
                fy = $yc; gsub(/^[ \t]+|[ \t]+$/, "", fy)
                # Concatenating "" forces string comparisons.
                if ((fx "") == (xv "") && (fy "") == (yv "")) print $zc
              }' "$data" | median)
      printf '%s\t%s\t%s\n' "$x" "$y" "$med" | tr -d 'kd'
    done
  done | sort -n
}

#######################################
# For each dataset and clusterer, plot normalised run time together with
# two validity columns against K, one set of curves per reducer, then
# bundle every .eps in plot_data into <name>.pdf in the starting directory.
# Arguments: $1 - run name; used in the .eps names and as the basename
#                 of the generated .tex/.dvi/.pdf
# Reads:     ~/Dropbox/data/<dataset>.csv
# Writes:    plot_data/* (intermediate data, .plt, .eps, .tex) and ./$1.pdf
# Requires:  buildDataFileForPlot, makeLatex, normalize, gawk, gnuplot,
#            latex, dvipdf
# Returns:   1 if plot_data cannot be entered
#######################################
plotSuperClusterRunTime() {
  local datasets="bbcsports ngBias3"
  local clusterers="kmeans genic canopy"
  local reducers=( tfidf pca fastmap )
  # gnuplot line type per reducer; index-aligned with reducers[].
  local lts=( -1 2 1 )
  local metrics=( RunTime InternalSimilarity Purity )
  # gnuplot point type per metric; index-aligned with metrics[].
  local pts=( 7 4 2 )
  local d c i j r lt m pt base name file start

  mkdir -p plot_data

  for d in $datasets; do
    file="$HOME/Dropbox/data/$d.csv"
    for c in $clusterers; do
      base="plot_data/${d}_${c}.tmp"
      name="plot_data/${c}_${d}_$1.eps"

      {
        echo "set terminal postscript "
        echo "set logscale x"
        echo "set xtics 0,5"
        echo "set xrange [3:1500]"
        # echo "set xlabel \"N (number of attributes)\""
        echo "set xlabel \"K (number of clusters)\""
        echo "set title 'Trade offs of Runtime Vs Cluster Validity assessments with $c in $d'"
        echo "set output \"$name\""
      } > "$base.plt"

      # Collect "reducer,x,runtime" rows for all reducers so the run
      # times can be normalised together.
      : > "$base.all.rt.dat"
      for i in 0 1 2; do
        r=${reducers[$i]}
        grep "$c" "$file" | grep "$r" > "$base.$r.csv"
        # Output columns: field 6 (x), then medians of fields 8, 10, 12
        # (original note: "k rt is pur").
        buildDataFileForPlot "$base.$r.csv" 6 8 10 12 > "$base.$r.dat"
        gawk -v r="$r" '{ print r "\t" $0 }' "$base.$r.dat" \
          | sed 's/\t/,/g' | cut -d, -f 1,2,3 >> "$base.all.rt.dat"
      done

      # normalize runtimes, then re-attach them to their reducer/x keys
      cut -d, -f 3 "$base.all.rt.dat" | normalize > "$base.rt.norm"
      gawk 'BEGIN { FS = "," }
            NR == FNR { a[FNR] = $0; next }
            { print $1 ", " $2 ", " a[FNR] }' \
        "$base.rt.norm" "$base.all.rt.dat" > "$base.rt.norm.dat"

      for i in 0 1 2; do
        r=${reducers[$i]}
        lt=${lts[$i]}
        grep "$r" "$base.rt.norm.dat" | cut -d, -f 2,3 | sed 's/, /\t/g' > "$base.$r.rt"

        # The very first curve opens the plot command; every later curve
        # is appended to the same line as a ", ..." clause.
        start=0
        if [[ $i == 0 ]]; then
          printf '%s' "plot '$base.$r.rt' u 1:2 with linespoints pt 7 lt $lt t \"Runtime - $r\"" >> "$base.plt"
          start=1
        fi
        for (( j = start; j <= 2; j++ )); do
          m=${metrics[$j]}
          pt=${pts[$j]}
          if [[ $j == 0 ]]; then
            # Run time comes from the normalised .rt file.
            printf '%s' ", '$base.$r.rt' u 1:2 with linespoints pt $pt lt $lt t \"$m - $r\"" >> "$base.plt"
          else
            # Other metrics come straight from columns 3 and 4 of the .dat file.
            printf '%s' ", '$base.$r.dat' u 1:$(( j + 2 )) with linespoints pt $pt lt $lt t \"$m - $r\"" >> "$base.plt"
          fi
        done
      done

      echo "load '$base.plt'" | gnuplot
    done
  done

  cd plot_data || return 1
  makeLatex "\.eps" "$1.tex"
  # makeLatex only writes the .tex source; compile it so that the .dvi
  # consumed by dvipdf actually exists (the original skipped this step).
  latex -interaction=nonstopmode "$1.tex"
  echo " dvipdf $1.dvi"
  dvipdf "$1.dvi"
  mv "$1.pdf" ../
  cd ..
}

#######################################
# Writes a gnuplot script into plot_data that plots normalised run time
# and four validity columns per reducer, then loads it into gnuplot.
# Reads:    ~/Dropbox/data/<dataset>.csv_new.csv
# Requires: buildDataFileForPlot, normalize, gawk, gnuplot
#
# The code below is behaviourally unchanged; only comments were added.
# NOTE(review): several things here look unfinished -- confirm intent
# before relying on this function:
#   * `$metrics` on an array expands to its first element only, so the
#     outer loop runs once (InternalSimilarity).
#   * `$d` is used to build base/name before the `for d` loop assigns it,
#     so it is empty or whatever an earlier function left behind.
#   * `start` is set but the inner loop always runs 0..4, so the run-time
#     curve is emitted twice for the first reducer.
#   * "plot ..." is appended again for every clusterer onto the same
#     unterminated line, and $base.csv / $base.$r.dat are overwritten
#     per clusterer.
#   * Despite the name and title, it emits `plot`, not `splot`.
#######################################
plotSuperClusterRunTime3d() {
  local datasets="bbcsports"
  local clusterers="kmeans genic canopy"
  local reducers=( tfidf pca fastmap )
  local metrics=( InternalSimilarity ExternalSimilarity Purity )
  local lts=( -1 2 1 )
  mkdir plot_data
  for metric in $metrics; do
    base="plot_data/$d"_"$metric""_3d.tmp"
    name=plot_data/$d"_"$metric"_3d.eps"
    echo "set terminal postscript eps enhanced color" > $base.plt
    # echo "set logscale x" >> $base.plt
    # echo "set xrange [3:10000]" >> $base.plt
    echo "set xlabel \"N (number of features)\"" >> $base.plt
    echo "set ylabel \"K (number of clusters)\"" >> $base.plt
    echo "set title 'Trade offs of Dimensionality Vs Number of Clusters Vs. $metric'" >> $base.plt
    echo "set output \"$name\"" >> $base.plt
    for d in $datasets; do
      local file=~/Dropbox/data/$d.csv_new.csv
      for c in $clusterers; do
        for i in `seq 0 2`; do
          r=${reducers[$i]}
          lt=${lts[$i]}
          cat $file | grep $c | grep $r > $base.csv
          # Output columns: field 5 (x), then medians of fields 8, 10, 9,
          # 11, 12 (original note: "k rt is es ent pur").
          buildDataFileForPlot $base.csv 5 8 10 9 11 12 > $base.$r.dat
          # normalize runtimes: replace column 2 with its normalised value
          cat $base.$r.dat | gawk 'BEGIN{FS="\t"} { print $2 } ' - | normalize > $base.rt.norm
          gawk 'BEGIN {FS="\t"; OFS="\t"} NR==FNR { a[FNR]=$1; next} { $2=a[FNR]; print $0 }' "$base.rt.norm" "$base.$r.dat" > $base.norm.dat
          mv $base.norm.dat $base.$r.dat
          # This re-declaration replaces the outer metrics array.
          local metrics=( RunTime InternalSimilarity ExternalSimilarity Entropy Purity )
          local pts=(7 4 6 2 12 )
          start=0
          if [[ $i == 0 ]]; then
            echo -n "plot '$base.$r.dat' u 1:2 with linespoints pt 7 lt $lt t \"Runtime - $r\"" >> $base.plt
            start=1
          fi
          for j in `seq 0 4`; do
            m=${metrics[$j]};
            pt=${pts[$j]}
            echo -n ", '$base.$r.dat' u 1:$(($j + 2)) with linespoints pt $pt lt $lt t \"$m - $r\"" >> $base.plt
          done
        done
      done
      echo "load '$base.plt'" | gnuplot
    done
  done
  # cd plot_data
  # makeLatex "\.eps" $1".tex"
  # echo " dvipdf $1".dvi
  # dvipdf $1".dvi"
  # mv $1".pdf" ../
  # cd ../
}

#######################################
# Split a results CSV by clusterer and reducer and render the full set
# of 2-D (superClusterPlot) and 3-D (superClusterSplot) PostScript plots.
# Arguments: $1 - results CSV
#            $2 - dataset name, used as the prefix of every output file
# Writes:    tmp_*.csv row subsets and <dataset>_*.ps in the current
#            directory (the tmp_*.csv files are left in place)
# Requires:  superClusterPlot, superClusterSplot (defined elsewhere)
#
# Plot titles and pre-existing output names are reproduced exactly,
# including their irregular spellings.
#######################################
plotSuperCluster() {
  local src=$1
  local dataset=$2

  # Argument tails shared by every call within a group of plots.
  local nValidity=( 5 "NumDimensions" "VariousValidityMeasures" "InternalSim" 9 "ExternalSim" 10 "Entropy" 11 "Purity" 12 )
  local entPur2d=( 5 "NumDimensions" "VariousValidityMeasures" "Entropy" 11 "Purity" 12 "Entropy" 11 "Purity" 12 )
  local sims3d=( 5 "NumDimensions" 6 "NumClusters" "Similarity" "InternalSim" 9 "ExternalSim" 10 )
  local entPur3d=( 5 "NumDimensions" 6 "NumClusters" "Entropy/Purity" "Entropy" 11 "Purity" 12 )
  local allByK=( 6 "NumClusters" "ClusterValidity" "InternalSim" 9 "ExternalSim" 10 "Entropy" 11 "Purity" 12 )
  local allByN=( 5 "NumDimensions" "ClusterValidity" "InternalSim" 9 "ExternalSim" 10 "Entropy" 11 "Purity" 12 )

  # Row subsets: one per clusterer/reducer pair ...
  grep tfidf   "$src" | grep genic  > tmp_genicTfidf.csv
  grep fastmap "$src" | grep genic  > tmp_genicFastmap.csv
  grep pca     "$src" | grep genic  > tmp_genicPCA.csv
  grep tfidf   "$src" | grep kmeans > tmp_kmeansTfidf.csv
  grep fastmap "$src" | grep kmeans > tmp_kmeansFastmap.csv
  grep pca     "$src" | grep kmeans > tmp_kmeansPCA.csv
  grep tfidf   "$src" | grep canopy > tmp_canopyTfidf.csv
  grep fastmap "$src" | grep canopy > tmp_canopyFastmap.csv
  grep pca     "$src" | grep canopy > tmp_canopyPCA.csv
  # ... then one per clusterer and one per reducer.
  grep genic   "$src" > tmp_genic.csv
  grep canopy  "$src" > tmp_canopy.csv
  grep kmeans  "$src" > tmp_kmeans.csv
  grep fastmap "$src" > tmp_Fastmap.csv
  grep pca     "$src" > tmp_PCA.csv
  grep tfidf   "$src" > tmp_Tfidf.csv

  #plots
  #N vs Validity
  #each treatment with clust validity (both sims, purity, and entropy) on Y
  superClusterPlot tmp_genicTfidf.csv "Genic_with_TFIDFK_vs_N_vs_Inter/Intra_Sim" "${nValidity[@]}" \
    > "${dataset}_genic_tfidf_NvsClustValidity.ps"
  superClusterPlot tmp_genicFastmap.csv "Genic_with_Fastmap_-_N_vs_Inter/Intra_Sim" "${nValidity[@]}" \
    > "${dataset}_genic_fastmap_NvsClustValidity.ps"
  superClusterPlot tmp_genicPCA.csv "Genic_with_PCA_-_N_vs_Inter/Intra_Sim" "${nValidity[@]}" \
    > "${dataset}_genic_pca_NvsClustValidity.ps"
  superClusterPlot tmp_kmeansTfidf.csv "Kmeans_with_TFIDF_-_N_vs_Inter/Intra_Sim" "${nValidity[@]}" \
    > "${dataset}_kmeans_tfidf_NvsClustValidity.ps"
  superClusterPlot tmp_kmeansFastmap.csv "Kmeans_with_Fastmap_-_N_vs_Inter/Intra_Sim" "${nValidity[@]}" \
    > "${dataset}_kmeans_fastmap_NvsClustValidity.ps"
  superClusterPlot tmp_kmeansPCA.csv "Kmeans_with_PCA_-_N_vs_Inter/Intra_Sim" "${nValidity[@]}" \
    > "${dataset}_kmeans_pca_NvsClustValidity.ps"
  superClusterPlot tmp_canopyTfidf.csv "Canopy_with_TFIDF_-_N_vs_Inter/Intra_Sim" "${nValidity[@]}" \
    > "${dataset}_canopy_tfidf_NvsClustValidity.ps"
  superClusterPlot tmp_canopyFastmap.csv "Canopy_with_Fastmap_-_N_vs_Inter/Intra_Sim" "${nValidity[@]}" \
    > "${dataset}_canopy_fastmap_NvsClustValidity.ps"
  superClusterPlot tmp_canopyPCA.csv "Canopy_with_PCA_-_N_vs_Inter/Intra_Sim" "${nValidity[@]}" \
    > "${dataset}_canopy_pca_NvsClustValidity.ps"

  #K vs Validity
  # NOTE(review): these calls pass field 5 "NumDimensions" as the X axis
  # although the section and file names say K, and list Entropy/Purity
  # twice. Left as found -- confirm against superClusterPlot's signature.
  superClusterPlot tmp_genicTfidf.csv "Genic_with_TFIDFK_vs_N_vs_Entropy/Purity" "${entPur2d[@]}" \
    > "${dataset}_genic_tfidf_KvsEntropyPurity.ps"
  superClusterPlot tmp_genicFastmap.csv "Genic_with_Fastmap_-_K_vs_N_vs_Entropy/Purity" "${entPur2d[@]}" \
    > "${dataset}_genic_fastmap_KvsEntropyPurity.ps"
  superClusterPlot tmp_genicPCA.csv "Genic_with_PCA_-_K_vs_N_vs_Entropy/Purity" "${entPur2d[@]}" \
    > "${dataset}_genic_pca_KvsEntropyPurity.ps"
  superClusterPlot tmp_kmeansTfidf.csv "Kmeans_with_TFIDF_-_K_vs_N_vs_Entropy/Purity" "${entPur2d[@]}" \
    > "${dataset}_kmeans_tfidf_KvsEntropyPurity.ps"
  superClusterPlot tmp_kmeansFastmap.csv "Kmeans_with_Fastmap_-_K_vs_N_vs_Entropy/Purity" "${entPur2d[@]}" \
    > "${dataset}_kmeans_fastmap_KvsEntropyPurity.ps"
  superClusterPlot tmp_kmeansPCA.csv "Kmeans_with_PCA_-_K_vs_N_vs_Entropy/Purity" "${entPur2d[@]}" \
    > "${dataset}_kmeans_pca_KvsEntropyPurity.ps"
  superClusterPlot tmp_canopyTfidf.csv "Canopy_with_TFIDF_-_K_vs_N_vs_Entropy/Purity" "${entPur2d[@]}" \
    > "${dataset}_canopy_tfidf_KvsEntropyPurity.ps"
  superClusterPlot tmp_canopyFastmap.csv "Canopy_with_Fastmap_-_K_vs_N_vs_Entropy/Purity" "${entPur2d[@]}" \
    > "${dataset}_canopy_fastmap_KvsEntropyPurity.ps"
  superClusterPlot tmp_canopyPCA.csv "Canopy_with_PCA_-_K_vs_N_vs_Entropy/Purity" "${entPur2d[@]}" \
    > "${dataset}_canopy_pca_KvsEntropyPurity.ps"

  #splots
  #similarity
  superClusterSplot tmp_genicTfidf.csv "Genic_with_TFIDFK_vs_N_vs_Inter/Intra_Sim" "${sims3d[@]}" \
    > "${dataset}_genic_tfidf_3d_KvNvsBothSims.ps"
  superClusterSplot tmp_genicFastmap.csv "Genic_with_Fastmap_-_K_vs_N_vs_Inter/Intra_Sim" "${sims3d[@]}" \
    > "${dataset}_genic_fastmap_3d_KvNvsBothSims.ps"
  superClusterSplot tmp_genicPCA.csv "Genic_with_PCA_-_K_vs_N_vs_Inter/Intra_Sim" "${sims3d[@]}" \
    > "${dataset}_genic_pca_3d_KvNvsBothSims.ps"
  superClusterSplot tmp_kmeansTfidf.csv "Kmeans_with_TFIDF_-_K_vs_N_vs_Inter/Intra_Sim" "${sims3d[@]}" \
    > "${dataset}_kmeans_tfidf_3d_NvsBothSims.ps"
  superClusterSplot tmp_kmeansFastmap.csv "Kmeans_with_Fastmap_-_K_vs_N_vs_Inter/Intra_Sim" "${sims3d[@]}" \
    > "${dataset}_kmeans_fastmap_3d_KvNvsBothSims.ps"
  superClusterSplot tmp_kmeansPCA.csv "Kmeans_with_PCA_-_K_vs_N_vs_Inter/Intra_Sim" "${sims3d[@]}" \
    > "${dataset}_kmeans_pca_3d_KvNvsBothSims.ps"
  superClusterSplot tmp_canopyTfidf.csv "Canopy_with_TFIDF_-_K_vs_N_vs_Inter/Intra_Sim" "${sims3d[@]}" \
    > "${dataset}_canopy_tfidf_3d_KvNvsBothSims.ps"
  superClusterSplot tmp_canopyFastmap.csv "Canopy_with_Fastmap_-_K_vs_N_vs_Inter/Intra_Sim" "${sims3d[@]}" \
    > "${dataset}_canopy_fastmap_3d_KvNvsBothSims.ps"
  superClusterSplot tmp_canopyPCA.csv "Canopy_with_PCA_-_K_vs_N_vs_Inter/Intra_Sim" "${sims3d[@]}" \
    > "${dataset}_canopy_pca_3d_KvNvsBothSims.ps"

  #purity/entropy
  superClusterSplot tmp_genicTfidf.csv "Genic_with_TFIDFK_vs_N_vs_Entropy/Purity" "${entPur3d[@]}" \
    > "${dataset}_genic_tfidf_3d_KvNvsEntropyPurity.ps"
  superClusterSplot tmp_genicFastmap.csv "Genic_with_Fastmap_-_K_vs_N_vs_Entropy/Purity" "${entPur3d[@]}" \
    > "${dataset}_genic_fastmap_3d_KvNvsEntropyPurity.ps"
  superClusterSplot tmp_genicPCA.csv "Genic_with_PCA_-_K_vs_N_vs_Entropy/Purity" "${entPur3d[@]}" \
    > "${dataset}_genic_pca_3d_KvNvsEntropyPurity.ps"
  superClusterSplot tmp_kmeansTfidf.csv "Kmeans_with_TFIDF_-_K_vs_N_vs_Entropy/Purity" "${entPur3d[@]}" \
    > "${dataset}_kmeans_tfidf_3d_NvsEntropyPurity.ps"
  superClusterSplot tmp_kmeansFastmap.csv "Kmeans_with_Fastmap_-_K_vs_N_vs_Entropy/Purity" "${entPur3d[@]}" \
    > "${dataset}_kmeans_fastmap_3d_KvNvsEntropyPurity.ps"
  superClusterSplot tmp_kmeansPCA.csv "Kmeans_with_PCA_-_K_vs_N_vs_Entropy/Purity" "${entPur3d[@]}" \
    > "${dataset}_kmeans_pca_3d_KvNvsEntropyPurity.ps"
  superClusterSplot tmp_canopyTfidf.csv "Canopy_with_TFIDF_-_K_vs_N_vs_Entropy/Purity" "${entPur3d[@]}" \
    > "${dataset}_canopy_tfidf_3d_KvNvsEntropyPurity.ps"
  superClusterSplot tmp_canopyFastmap.csv "Canopy_with_Fastmap_-_K_vs_N_vs_Entropy/Purity" "${entPur3d[@]}" \
    > "${dataset}_canopy_fastmap_3d_KvNvsEntropyPurity.ps"
  superClusterSplot tmp_canopyPCA.csv "Canopy_with_PCA_-_K_vs_N_vs_Entropy/Purity" "${entPur3d[@]}" \
    > "${dataset}_canopy_pca_3d_KvNvsEntropyPurity.ps"

  # Per-clusterer / per-reducer validity against K.
  # These six plots used to be written to the *_2d_KvNvsEntropyPurity.ps
  # names that the N-axis plots below also use, so they were always
  # overwritten. They now get their own *_2d_KvsClustValidity.ps files;
  # the existing names keep the content they ended up with before.
  superClusterPlot tmp_kmeans.csv "All_Kmeans_-_K_vs_vs_ClusterValidity" "${allByK[@]}" \
    > "${dataset}_kmeans_2d_KvsClustValidity.ps"
  superClusterPlot tmp_canopy.csv "All_Canopy_-_K_vs_vs_ClusterValidity" "${allByK[@]}" \
    > "${dataset}_canopy_2d_KvsClustValidity.ps"
  superClusterPlot tmp_genic.csv "All_Genic_-_K_vs_vs_ClusterValidity" "${allByK[@]}" \
    > "${dataset}_genic_2d_KvsClustValidity.ps"
  superClusterPlot tmp_Tfidf.csv "All_TFIDF_-_K_vs_vs_ClusterValidity" "${allByK[@]}" \
    > "${dataset}_tfidf_2d_KvsClustValidity.ps"
  superClusterPlot tmp_PCA.csv "All_PCA_-_K_vs_vs_ClusterValidity" "${allByK[@]}" \
    > "${dataset}_pca_2d_KvsClustValidity.ps"
  superClusterPlot tmp_Fastmap.csv "All_Fastmap_-_K_vs_vs_ClusterValidity" "${allByK[@]}" \
    > "${dataset}_fastmap_2d_KvsClustValidity.ps"

  # Per-clusterer / per-reducer validity against N (output names unchanged).
  superClusterPlot tmp_kmeans.csv "All_Kmeans_-_N_vs_vs_ClusterValidity" "${allByN[@]}" \
    > "${dataset}_kmeans_2d_KvNvsEntropyPurity.ps"
  superClusterPlot tmp_canopy.csv "All_Canopy_-_N_vs_vs_ClusterValidity" "${allByN[@]}" \
    > "${dataset}_canopy_2d_KvNvsEntropyPurity.ps"
  superClusterPlot tmp_genic.csv "All_Genic_-_N_vs_vs_ClusterValidity" "${allByN[@]}" \
    > "${dataset}_genic_2d_KvNvsEntropyPurity.ps"
  superClusterPlot tmp_Tfidf.csv "All_TFIDF_-_N_vs_vs_ClusterValidity" "${allByN[@]}" \
    > "${dataset}_tfidf_2d_KvNvsEntropyPurity.ps"
  superClusterPlot tmp_PCA.csv "All_PCA_-_N_vs_vs_ClusterValidity" "${allByN[@]}" \
    > "${dataset}_pca_2d_KvNvsEntropyPurity.ps"
  superClusterPlot tmp_Fastmap.csv "All_Fastmap_-_N_vs_vs_ClusterValidity" "${allByN[@]}" \
    > "${dataset}_fastmap_2d_KvNvsEntropyPurity.ps"
}