# /* vim: set filetype=sh : */
# usage: bash our gawkrc
##########################################################################
# ourgawk : a simple learning environment for gawk
# Copyright (C) 2007, Tim Menzies, tim@menzies.us, http://menzies.us
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
##########################################################################

# Directory the shell was in when this file was loaded; demo13/demo14
# return here after working inside $Tmp.
Here=$(pwd)

##### generic stuff

# Re-source the rc file named by $Ourrc.
# NOTE(review): $Ourrc is not assigned anywhere in this file; it is
# presumably exported by whatever launches this environment - confirm.
reload() {
  if [ -z "$Ourrc" ]; then
    blabln "reload: \$Ourrc is not set"
    return 1
  fi
  . "$Ourrc"
}

# Print the definition of the shell variable(s) or function named $1.
# If any `set` line starting with $1 contains "=", the matching lines are
# printed; otherwise the function body (from "^$1 " to the closing "}")
# is printed.
show() {
  local goal1="^$1"
  local com="/^$1 /,/^}/{print}"
  # Only the exit status of the probe matters, so discard its output
  # instead of writing it to a predictable file under /tmp.
  if set | grep -- "$goal1" | grep "=" > /dev/null; then
    set | grep -- "$goal1"
  else
    set | gawk "$com"
  fi
}

# Write the arguments to stderr without a trailing newline.
# %b keeps backslash escapes such as \n working while preventing a "%"
# in the message from being treated as a printf conversion.
blab() { printf '%b' "$*" >&2; }

# Same as blab, followed by a newline.
blabln() { printf '%b\n' "$*" >&2; }

#### initialization stuff

# Run all start-up steps: variables first (setUpDirs reads them).
setUp() {
  setUpVars
  setUpDirs
}

# Switch the ls alias to the "--color" flag spelling.
lcsee() {
  alias ls="ls --color"
}

# Define the prompt, PATH and the global locations used by the rest of
# this file (Ourgawk, Safe, Audit).
setUpVars() {
  local profiler=pgawk
  alias ls="ls -G"
  PS1="Our GAWK!:\!$ "
  PROMPT_COMMAND='echo -ne "\033]0;${HOSTNAME}: `pwd`\007"'
  Ourgawk=$HOME/opt/ourgawk
  PATH="$Ourgawk/bin:$HOME/bin:$PATH"
  Safe=$Ourgawk/var/safe
  #hint for the following questions: man gawk
  #Q1: what does '--profile' do?
  #Q2: what does '--dump-variables' do?
  #Q3: what does '--lint' do?
  # Newer gawk releases fold the profiler into gawk itself and no longer
  # ship a separate pgawk binary, so fall back to gawk when it is absent.
  if ! command -v pgawk > /dev/null 2>&1; then
    profiler=gawk
  fi
  Audit="$profiler --profile=$HOME/tmp/awkprof.out --dump-variables=$HOME/tmp/awkvars.out --lint "
}

# Create the working directories and a private scratch directory ($Tmp).
setUpDirs() {
  mkdir -p "$HOME/tmp"
  mkdir -p "/tmp/$USER"
  Tmp=$(mktemp -d -p "/tmp/$USER")
  mkdir -p "$Ourgawk/lib" # for support code
  mkdir -p "$Ourgawk/bin" # for our executables
  mkdir -p "$HOME/bin"    # for your executables
  mkdir -p "$Safe"        # for stuff you want to keep around
}

#### utilities

# Print the lines of stdin in random order.
shuffle() {
  #Q4: what is an associative array? What is something that _can_
  #    be an array index in gawk but _cannot_ be an array index in "C"?
  gawk '
  NR==1 {srand(Seed ? Seed : 1)}
        {Array[rand()]=$0}
  END   {for(I in Array) print Array[I]}
  ' Seed=$RANDOM -
}

classes() {
  #Q5: This script processes an arff file like the one produced
  #    by "weather" (see below). It looks for all the class names
  #    after the "@data" line and prints their frequency. Carefully
  #    comment and explain each line.
  #    Hints:
  #    1) http://www.delorie.com/gnu/docs/gawk/gawk_116.html
  #    2) the pattern of processing in this function is repeated
  #       elsewhere in this file
  gawk '
  BEGIN      { OFS=FS=","
               IGNORECASE=1 }
             { sub(/#.*/,"") }
  /^[ \t]*$/ { next }
  Data       { Freq[$NF]++ }
  /@data/    { Data=1 }
  END        { print "#n,x"; for(N in Freq) print Freq[N],N}
  ' -
}

asLog() {
  #Q6: This script checks for numeric attributes. How?
  gawk '
  BEGIN        {OFS=","; IGNORECASE=1}
  /@attribute/ {Attr++}
  /@attribute/ && $3 ~ /numeric|real|integer|continuous/ { Num[Attr]=Attr; }
  /@data/      {In=1; FS=","}
  /@/          {print; next; }
  In && NF > 2 {
    for(I in Num) {
      if ($I !~ /\?/) {
        $I = ($I < 0.000001) ? 0.000001 : $I;
        $I = log($I) }}
    print $0 }' -
}

# Read an arff file on stdin, shuffle its instances, and write one bin of
# them to test.arff and the rest to train.arff (both in the current
# directory). Returns 1 on --help, on an unknown flag, or when a flag is
# missing its value.
someArff() {
  #Q7: add command-line options to someArff to control the
  #    names of the generated test/train files (currently
  #    train.arff and test.arff). Remember to define default
  #    values for these variables and to update the help
  #    text. Hand in your new definition of "someArff"
  local bins=3
  local bin=1
  local seed=$RANDOM
  # Only arguments that START with a dash are flags.
  while [[ "$1" == -* ]]; do
    case "$1" in
      -B|--bins) bins=$2 ;;
      -b|--bin)  bin=$2 ;;
      -s|--seed) seed=$2 ;;
      -h|--help)
        cat <<-EOF
someArff : divide an arrf file into Bins, create train/test files
usage: someArff [flags] arffFile
Flags
 -B, --bins NUM   Randomly divide the data into NUM bins
 -b, --bin NUM    Store bin NUM into test.arff and rest into train.arff
 -s, --seed NUM   Set the random number seed to NUM
 -h, --help       Print this text
EOF
        return 1 ;;
      *)
        blabln "'$1' unknown\n usage cat file | someArff [options]"
        return 1 ;;
    esac
    # A flag without a value would make "shift 2" fail and leave $1 in
    # place, spinning this loop forever.
    if [ "$#" -lt 2 ]; then
      blabln "'$1' needs a value"
      return 1
    fi
    shift 2
  done
  gawk '
  BEGIN       { Trainf="train.arff"; Testf="test.arff"; Bins=3; Bin=1; Seed=1; }
  /^[ \t]*$/  { next }
  /@relation/ { Seed ? srand(Seed) : srand(1) }
  /@relation/ { printf "">Trainf; printf "">Testf }
  /@relation/,/@data/ { print $0 >> Trainf; print $0 >> Testf; next }
              { Line[rand()] = $0; Lines++ }
  END { # N counts 1..Lines, so bin b owns the half-open range
        # (Start, Stop]; multiplying before dividing keeps Stop exactly
        # equal to Lines for the last bin.
        Start = Lines * (Bin - 1) / Bins;
        Stop  = Lines * Bin / Bins;
        for(I in Line) {
          N++;
          What = (N > Start && N <= Stop) ? Testf : Trainf
          print Line[I]>>What; } }
  ' Seed="$seed" Bins="$bins" Bin="$bin" -
}

# Print the number of non-blank, non-comment lines after "@data" on stdin.
instances() {
  gawk '
  BEGIN      { IGNORECASE=1 }
             { sub(/#.*/,"") }     # kill comments
  /^[ \t]*$/ { next }              # kill blank lines
  Collect    { N++ }               # collect data
  /@data/    { Collect=1; }        # switch to data section
  END        { print N+0 }         # report data (0, not "", when empty)
  '
}

# Print the number of numeric "@attribute" lines seen before "@data".
numerics() {
  gawk '
  BEGIN      { IGNORECASE=1 }
             { sub(/#.*/,"") }
  /^[ \t]*$/ { next }
  /@attribute/ && $3 ~ /real|numeric|integer|continuous/ {Nums++}
  /@data/    { print Nums+0; exit }
  '
}

# Right-align comma-separated columns read from stdin. Lines with a
# single field are passed through unchanged.
malign() {
  gawk '
  BEGIN { Width=1; Gutter=1; OFS=FS=","; }
  { N++;
    for(I=1;I<=NF;I++) {
      if( (L=length($I)) > Max[I]) Max[I]=L;
      ++Data[N,0];
      Data[N,I]=$I; } }
  END { for(J=1;J<=N;J++) {
          Str=Sep1="";
          if (Data[J,0]>1) {
            for(I=1;I<=NF;I++) {
              L=length(Data[J,I]);
              Str = Str Sep1 \
                    str(most(Width,Max[I]+Gutter+1)-L," ") \
                    Data[J,I];
              Sep1= OFS; }
          } else {Str=Data[J,1]}
          print Str;} }
  function str(n,c,  out) { while(--n > 0) out = out c; return out; }
  function most(x,y) { return x > y ? x : y; }
  '
}

# Placeholder for the Q8 exercise below.
uniquesWorker() {
  true
}

uniques() {
  #Q8 : write a function that inputs 'weather' and for each column, outputs
  # the frequency of each word in that column. Hint: if you are processing
  # the column 'col' and you find the 'word' then if this is the first time
  # you've seen it, you should store the 'word' on top of a stack of 'Items'
  # for that column. Sounds complex, right? Well, it's just one line:
  #
  #   if (++Count[col,word] == 1) { Items[col, ++Items[col,0]] = word}
  weather | uniquesWorker
}

# Echo comma-separated stdin, then append a "#median" row holding the
# median of every column from the second one on. Lines containing "#"
# are echoed but not counted.
median() {
  #Q9: using gawk functions, remove all globals from this code
  #    except FS, NF, Data
  #Q10: there should be no need for the "delete Val" line in the
  #    following code. Get rid of it. Hint:
  #    add a function with "val" as a local variable that is called
  gawk '
  BEGIN { FS="," }
        { print }
  /#/   { next }
        { for(I=2;I<=NF;I++) { Data[I,0]++; Data[I,Data[I,0]]=$I } }
  END   { printf("#---" );
          for(I=2;I<=NF;I++) printf(",-----")
          print ""
          printf("#median");
          for(I=2;I<=NF;I++) {
            Max=Data[I,0];
            delete Val
            N=0;
            for(J=1;J<=Max;J++) Val[J] = Data[I,J]
            asort(Val);
            # asort output is indexed from 1, so the middle element of
            # Max items sits at int((Max+1)/2), not int(Max/2).
            printf(",%s",Val[int((Max+1)/2)]);
          }
          print ""
  }' -
}

#### some data

# NOTE(review): the line breaks inside the three data sets below were
# reconstructed (one record per line) - confirm against the original.

# Print a small arff data set.
weather() {
  cat <<-EOF
@relation weather
@attribute outlook {sunny, overcast, rainy}
@attribute temperature real
@attribute humidity real
@attribute windy {TRUE, FALSE}
@attribute play {yes, no}
@data
sunny,85,85,FALSE,no
sunny,80,90,TRUE,no
overcast,83,86,FALSE,yes
rainy,70,96,FALSE,yes
rainy,68,80,FALSE,yes
rainy,65,70,TRUE,no
overcast,64,65,TRUE,yes
sunny,72,95,FALSE,no
sunny,69,70,FALSE,yes
rainy,75,80,FALSE,yes
sunny,75,70,TRUE,yes
overcast,72,90,TRUE,yes
overcast,81,75,FALSE,yes
rainy,71,91,TRUE,no
EOF
}

# Print a list of words and numbers, one token per line.
demo4data() {
  cat <<-EOF
10
apple
apple
apple
cat
4
oranges
oranges
oranges
oranges
oranges
oranges
oranges
oranges
oranges
oranges
oranges
oranges
oranges
oranges
oranges
oranges
1
2
1
EOF
}

# Print sample learner results as comma-separated rows.
demo14data() {
  cat <<-EOF
#file,x,attrs,bin,learner,a,b,c,d,acc,pd,pf,prec,g
pc2_mod,loggedDiscrete,16,9,nb,506,0,51,2,90.877,100.000,9.156,3.774,6.474
pc2_mod,loggedDiscrete,16,9,aode,556,2,1,0,99.463,0.000,0.180,0.000,70.711
pc2_mod,loggedDiscrete,16,10,j48,538,1,0,0,99.814,0.000,0.000,0.000,70.711
pc2_mod,loggedDiscrete,16,10,jrip,538,1,0,0,99.814,0.000,0.000,0.000,70.711
pc2_mod,loggedDiscrete,16,10,oner,538,1,0,0,99.814,0.000,0.000,0.000,70.711
pc2_mod,loggedDiscrete,16,10,nb,501,1,37,0,92.950,0.000,6.877,0.000,70.878
pc2_mod,loggedDiscrete,16,10,aode,537,1,1,0,99.629,0.000,0.186,0.000,70.711
#file,x,attrs,bin,learner,a,b,c,d,acc,pd,pf,prec,g
pc3_mod,discrete,4,1,j48,137,19,0,0,87.821,0.000,0.000,0.000,70.711
pc3_mod,discrete,4,1,jrip,137,19,0,0,87.821,0.000,0.000,0.000,70.711
pc3_mod,discrete,4,1,oner,137,19,0,0,87.821,0.000,0.000,0.000,70.711
pc3_mod,discrete,4,1,nb,137,19,0,0,87.821,0.000,0.000,0.000,70.711
pc3_mod,discrete,4,1,aode,137,19,0,0,87.821,0.000,0.000,0.000,70.711
EOF
}

#### demos

demo1() {
  gawk 'BEGIN {print "hello world"}'
}

demo2() {
  ls | gawk '{print You " have the file " $0 }' You="$USER"
}

demo3() {
  #Q10; is this script broken? when i call it from the command
  #     line using 'demo3', it just hangs. What is going on?
  gawk '
      { You=$0; }
  END { print "hello " You }'
}

demo4worker() {
  #Q11; write down and explain each part of the following regular
  #     expression shown in 'Number'
  gawk '
  BEGIN { FS=",";
          Number = "^[+-]?([0-9]+[.]?[0-9]*|[.][0-9]+)([eE][+-]?[0-9]+)?$";
        }
        { Stats = $1 ~ Number ? "is" : "is not";
          print "[" $1 "] " Stats " a number" }'
}

demo4() {
  demo4data | demo4worker
}

demo5() {
  demo4data | gawk '
  BEGIN { OFS=","}
        { for(I=1;I <=NF;I++) Freq[$I]++ }
  END   { for(N in Freq) print Freq[N], N }' |
    sort -t, -n -k 1,1
}

demo6() {
  demo4data | gawk '
  BEGIN { OFS=","}
        { for(I=1;I <=NF;I++) { All++; Freq[$I]++ } }
  END   { print "#n,item,percent,graph"
          for(N in Freq) {
            Percent = int(100*Freq[N]/All)
            print Freq[N], N, Percent, str(Percent) } }
  function str(n,  out) { while(--n > 0) out = out "*"; return out }' |
    sort -t, -n -k 1,1
}

# modify demo9 such that if the first line contains "<" to left justify
demo7() {
  gawk '
  function top(a)        {return a[a[0]]}
  function push(a,x,  i) {i=++a[0]; a[i]=x; return i}
  function pop(a,  x,i)  {
    i=a[0]--;
    if (!i) {return ""} else {x=a[i]; delete a[i]; return x}}
  BEGIN {push(a,1); push(a,2); push(a,3);
         while(x=pop(a)) print x }'
}

demo8() {
  echo "---------------------"
  demo4data | shuffle
  echo "---------------------"
  demo4data | shuffle
}

demo9() {
  #Q12: what is different about demo9 to demo5. explain
  #     in detail how that difference is implemented.
  demo6 | malign
}

demo10() {
  weather | classes | malign
}

demo11() {
  printf '%s' "Instances="
  weather | instances
}

demo12() {
  #Q13: the 'instances' script of demo11 runs thru the whole file
  #     but the 'numerics' script of demo12 stops after '@data'. why?
  printf '%s' "Numerics="
  weather | numerics
}

demo13() {
  #Q14: why does this script change to $Tmp?
  local Bins=4
  # Bail out rather than write train/test files into the wrong directory.
  cd "$Tmp" || return
  weather | someArff --bins "$Bins" --bin 1
  cat train.arff
  echo ""
  cat test.arff
  cd "$Here" || return
}

demo14() {
  local Seed=$RANDOM
  local Bins=2
  local Bin
  cd "$Tmp" || return
  for ((Bin = 1; Bin <= Bins; Bin++)); do
    printf '\n---| %s |-------------------------\n\n' "$Bin"
    weather | someArff --bins "$Bins" --bin "$Bin" --seed "$Seed"
    cat train.arff
    echo ""
    cat test.arff
  done
  cd "$Here" || return
}

demo15() {
  #Q15: what is going on here? what does 'median' do?
  #     why is its output going to 'malign'?
  demo14data | median | malign
}

demo16() {
  # Q16: there is an 'escaped' local variable in this script.
  #      (i.e. something that should have been declared local
  #      but the silly programmer forgot to do that). Run
  #      the script and use the output logs generated by 'Audit'
  #      to find that sucker. Fix the program (declare one more local).
  # $Audit is left unquoted on purpose: it holds a command plus flags.
  demo4data | $Audit --source '
  BEGIN { OFS=","}
        { for(I=1;I <=NF;I++) { All++; Freq[$I]++ } }
  END   { print "#n,item,percent,graph"
          for(N in Freq) {
            Percent = int(100*Freq[N]/All)
            print Freq[N], N, Percent, str(Percent) } }
  function str(n) { while(--n > 0) out = out "*"; return out }' |
    sort -t, -n -k 1,1
}

demo17() {
  # Q17: adapt 'someArff' to 'firstArff'. This generates a train
  #      and test set where train includes all of bins {1,2 .. (bin-1)}
  #      and the test set contains bin {i}.
  true
}

demo18() {
  # Q18: adapt 'someArff' to 'nArff' that accepts a command line
  #      argument "N" that returns an arff file containing up to
  #      the first N instances.
  true
}

#### start up

setUp
blabln "OurGawk version v0.1 (c)2007 tim@menzies.us under GPLv3"
echo "Tmp=$Tmp"
blabln "If you can get your job done with grep don't use SED."
blabln "If you can do it with SED don't use AWK."
blabln "If you can do it with AWK don't use 'C'.\n"