# /* vim: set filetype=sh : */
# usage: bash our gawkrc
##########################################################################
#    ourgawk : a simple learning environment for gawk
#    Copyright (C) 2007, Tim Menzies, tim@menzies.us, http://menzies.us
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, version 3.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
##########################################################################

# Directory this file was loaded from; demo13 and demo14 cd back to it.
Here=$(pwd)

##### generic stuff
# reload: re-source the file named by $Ourrc into the current shell.
# Globals: Ourrc (read). It is not assigned anywhere in this file, so it is
#          presumably set by whoever starts this environment (TODO confirm).
# Returns: 1 if Ourrc is unset or empty, otherwise the status of the source.
reload() {
    if [[ -z "${Ourrc:-}" ]]; then
        printf 'reload: Ourrc is not set\n' >&2
        return 1
    fi
    # Quoted so that a path containing spaces is sourced as one file.
    . "$Ourrc"
}
# show NAME: print the definition of the shell variable(s) or function NAME.
#   $1 - name to look up; it is used as a regular expression anchored at the
#        start of each line of `set` output.
# If some line of `set` output starts with NAME and contains "=", every line
# starting with NAME is printed (the variable case). Otherwise the lines from
# "NAME " down to the first line starting with "}" are printed (a function).
show() {
    local goal1="^$1"
    local com="/^$1 /,/^}/{print}"
    # The probe output is discarded instead of being left in a shared file
    # under /tmp.
    if set | grep -- "$goal1" | grep "=" > /dev/null; then
        set | grep -- "$goal1"
    else
        set | gawk "$com"
    fi
}
# blab MSG...: write the arguments (joined by spaces) to stderr, no newline.
# Backslash escapes such as \n or \t in the message are expanded; "%" is
# printed literally because the message is passed as data, not as the format.
blab()   { printf '%b' "$*" >&2; }
# blabln MSG...: like blab, but ends the message with a newline. Backslash
# escapes in the message are expanded; "%" is printed literally.
blabln() { printf '%b\n' "$*" >&2; }

#### initialization stuff
# setUp: initialise the environment. Variables come first because setUpDirs
# creates directories named by $Ourgawk and $Safe, which setUpVars assigns.
setUp() {
    setUpVars
    setUpDirs
}
# lcsee: alias `ls` to `ls --color`, replacing the `ls -G` alias that
# setUpVars installs.
lcsee() {
    alias ls='ls --color'
}
# setUpVars: define the alias, prompt, search path and global variables used
# by the rest of this file.
# Globals written: PS1, PROMPT_COMMAND, Ourgawk, PATH, Safe, Audit
# Globals read:    HOME, PATH
setUpVars() {
    local profiler=gawk
    alias ls='ls -G'
    PS1='Our GAWK!:\!$ '
    PROMPT_COMMAND='echo -ne "\033]0;${HOSTNAME}: `pwd`\007"'
    Ourgawk=$HOME/opt/ourgawk
    # Prepend only once so that reload does not keep growing PATH.
    case ":$PATH:" in
        *":$Ourgawk/bin:"*) ;;
        *) PATH="$Ourgawk/bin:$HOME/bin:$PATH" ;;
    esac
    Safe=$Ourgawk/var/safe
    #hint for the following questions: man gawk
    #Q1: what does '--profile' do?
    #Q2: what does '--dump-variables' do?
    #Q3: what does '--lint' do?
    # Older gawk releases ship the profiler as a separate pgawk binary; newer
    # ones accept --profile directly, so fall back to plain gawk.
    if command -v pgawk > /dev/null 2>&1; then
        profiler=pgawk
    fi
    # Audit is expanded unquoted by its user (demo16) so that it word-splits
    # into a command plus options; that breaks if $HOME contains whitespace.
    Audit="$profiler --profile=$HOME/tmp/awkprof.out --dump-variables=$HOME/tmp/awkvars.out --lint"
}
# setUpDirs: create the working directories and a fresh scratch directory.
# Globals read:    HOME, USER, Ourgawk, Safe
# Globals written: Tmp (a new directory made by mktemp under /tmp/$USER)
# Returns: non-zero as soon as a directory cannot be created.
setUpDirs() {
    mkdir -p "$HOME/tmp" || return
    mkdir -p "/tmp/$USER" || return
    Tmp=$(mktemp -d -p "/tmp/$USER") || return
    mkdir -p "$Ourgawk/lib" || return  # for support code
    mkdir -p "$Ourgawk/bin" || return  # for our executables
    mkdir -p "$HOME/bin"    || return  # for your executables
    mkdir -p "$Safe"                   # for stuff you want to keep around
}
#### utilities
# shuffle: copy the lines of stdin to stdout in a random order.
# Each line is stored under a random key and the array is then printed in
# whatever order gawk walks it. The generator is seeded from $RANDOM (or 1 if
# that is 0), so two calls normally give different orders.
shuffle() {
    #Q4: what is an associative array? What is something that _can_
    #    be an array index in gawk but _cannot_ be an array index in "C"?
    gawk '
    NR==1 { srand(Seed ? Seed : 1) }
          { # draw again on a key collision so that no line is overwritten
            do { Key = rand() } while (Key in Array)
            Array[Key] = $0 }
    END   { for (I in Array) print Array[I] }
    ' Seed="$RANDOM" -
}
# classes: read an ARFF file on stdin and print the header line "#n,x"
# followed by one "count,value" line for every distinct value found in the
# last comma-separated field of the rows after the "@data" line.
# Text from "#" to end of line and blank lines are ignored; the order of the
# count lines is whatever order gawk walks the array in.
classes() {
    #Q5: This script processes an arff file like the one produced
    #    by "weather" (see below). It looks for all the class names
    #    after the "@data" line and prints their frequency. Carefully comment and explain
    #    each line.
    #     Hints:
    #          1) http://www.delorie.com/gnu/docs/gawk/gawk_116.html
    #          2) the pattern of processing in this function is repeated
    #             elsewhere in this file
    gawk '
    BEGIN      { OFS = FS = ","
                 IGNORECASE = 1 }
               { sub(/#.*/, "") }
    /^[ \t]*$/ { next }
    Data       { Freq[$NF]++ }
    /@data/    { Data = 1 }
    END        { print "#n,x"
                 for (N in Freq) print Freq[N], N }
    ' -
}
# asLog: copy an ARFF file from stdin to stdout, replacing the values of the
# numeric attributes with their natural logarithm.
#   - lines containing "@" are printed unchanged;
#   - in data rows, a field containing "?" is left alone; any other numeric
#     field is first raised to at least 0.000001 and then passed to log();
#   - only rows after "@data" that have more than two fields are printed.
asLog() {
    #Q6: This script checks for numeric attributes. How?
    gawk '
    BEGIN                      { OFS = ","; IGNORECASE = 1 }
    /@attribute/               { Attr++ }
    /@attribute/ && $3 ~ /numeric|real|integer|continuous/ {
                                 Num[Attr] = Attr }
    /@data/                    { In = 1; FS = "," }
    /@/                        { print; next }
    In && NF > 2 {
        for (I in Num) {
            if ($I !~ /\?/) {
                $I = ($I < 0.000001) ? 0.000001 : $I
                $I = log($I) }}
        print $0
    }' -
}
# someArff: split the ARFF file on stdin into train.arff and test.arff, both
# written to the current directory.
# The header (from "@relation" through "@data") is copied to both files. The
# remaining non-blank lines are put in a random order and cut into BINS
# slices; slice BIN goes to test.arff and all the others to train.arff.
# Flags:
#   -B, --bins NUM   number of slices (default 3; must be a positive integer)
#   -b, --bin  NUM   which slice becomes the test set (default 1)
#   -s, --seed NUM   random seed (default $RANDOM)
#   -h, --help       print the help text
# Returns: 1 after printing help, on an unknown flag, on a flag without a
#          value, or on an invalid bin count; otherwise the status of gawk.
someArff() {
    #Q7: add command-line options to someArff to control the
    #    names of the generated test/train files (currently
    #    train.arff and test.arff). Remember to define default
    #    values for these variables and to update the help
    #    text. Hand in your new definition of "someArff"
    local bins=3
    local bin=1
    local seed=$RANDOM
    # Only arguments that START with "-" are flags.
    while [[ "${1:-}" == -* ]]; do
        case "$1" in
            -h|--help)
                printf '%s\n' \
                    'someArff : divide an arff file into Bins, create train/test files' \
                    'usage: someArff [flags] < arffFile' \
                    '' \
                    'Flags' \
                    '-B, --bins NUM   Randomly divide the data into NUM bins' \
                    '-b, --bin  NUM   Store bin NUM into test.arff and rest into train.arff' \
                    '-s, --seed NUM   Set the random number seed to NUM' \
                    '-h, --help       Print this text'
                return 1 ;;
            -B|--bins|-b|--bin|-s|--seed)
                # "shift 2" fails and shifts nothing when the value is
                # missing, which would loop forever; stop here instead.
                if (( $# < 2 )); then
                    blabln "'$1' needs a value\n usage cat file | someArff [options]"
                    return 1
                fi
                case "$1" in
                    -B|--bins) bins=$2 ;;
                    -b|--bin)  bin=$2 ;;
                    -s|--seed) seed=$2 ;;
                esac
                shift 2 ;;
            *)  blabln "'$1' unknown\n usage cat file | someArff [options]"
                return 1 ;;
        esac
    done
    # Bins is a divisor inside the gawk program.
    if ! [[ "$bins" =~ ^[1-9][0-9]*$ ]]; then
        blabln "'$bins' is not a valid number of bins"
        return 1
    fi
    # The settings are passed as operands (not with -v) so that they are
    # applied after BEGIN and therefore override the defaults set there.
    gawk '
    BEGIN {
        Trainf = "train.arff"; Testf = "test.arff"
        Bins = 3
        Bin  = 1
        Seed = 1
    }
    /^[ \t]*$/          { next }
    /@relation/         { Seed ? srand(Seed) : srand(1) }
    /@relation/         { printf "" > Trainf; printf "" > Testf }
    /@relation/,/@data/ { print $0 >> Trainf; print $0 >> Testf; next }
                        { Line[rand()] = $0; Lines++ }
    END {
        Start = Lines / Bins * (Bin - 1)
        Stop  = Lines / Bins * Bin
        for (I in Line) {
            N++
            # N runs from 1 to Lines, so slice k is Start < N <= Stop;
            # this way every instance lands in exactly one test slice.
            What = (N > Start && N <= Stop) ? Testf : Trainf
            print Line[I] >> What
        }
    }
    ' Seed="$seed" Bins="$bins" Bin="$bin" -
}
# instances: read an ARFF file on stdin and print how many non-blank,
# non-comment lines follow the "@data" line (0 when there are none).
instances() {
    gawk '
    BEGIN      { IGNORECASE = 1 }
               { sub(/#.*/, "") }   # kill comments
    /^[ \t]*$/ { next }             # kill blank lines
    Collect    { N++ }              # collect data
    /@data/    { Collect = 1 }      # switch to data section
    END        { print N + 0 }      # report data; "+ 0" turns no rows into 0
    '
}
# numerics: read an ARFF file on stdin and print how many "@attribute" lines
# have a third field naming a numeric type (real, numeric, integer or
# continuous, in any letter case). The count is printed when the "@data"
# line is reached and reading stops there; 0 is printed when there are none.
numerics() {
    gawk '
    BEGIN      { IGNORECASE = 1 }
               { sub(/#.*/, "") }
    /^[ \t]*$/ { next }
    /@attribute/ && $3 ~ /real|numeric|integer|continuous/ { Nums++ }
    /@data/    { print Nums + 0; exit }
    '
}
# malign: right-align the comma-separated columns of stdin.
# Every field is padded on the left with spaces to one more than the widest
# value seen in its column, and the fields are joined with commas again.
# Lines that do not have at least two fields are printed unchanged.
malign() {
    gawk '
    BEGIN { Width  = 1
            Gutter = 1
            OFS = FS = ","
    }
    { N++
      Data[N,0] = NF                 # remember how many fields this row has
      for (I = 1; I <= NF; I++) {
          if ((L = length($I)) > Max[I]) Max[I] = L
          Data[N,I] = $I }
    }
    END { for (J = 1; J <= N; J++) {
              Str = Sep1 = ""
              if (Data[J,0] > 1) {
                  # use the field count of THIS row, not of the last line read
                  for (I = 1; I <= Data[J,0]; I++) {
                      L   = length(Data[J,I])
                      Pad = str(most(Width, Max[I] + Gutter + 1) - L, " ")
                      Str = Str Sep1 Pad Data[J,I]
                      Sep1 = OFS
                  }}
              else { Str = Data[J,1] }
              print Str }
    }
    function str(n,c,  out) { while (--n > 0) out = out c; return out }
    function most(x,y)      { return x > y ? x : y }
    '
}
# uniquesWorker: placeholder for the answer to Q8 (see uniques). It currently
# does nothing and reports success.
uniquesWorker() {
    true
}
# uniques: pipe the weather data set into uniquesWorker.
uniques() {
    #Q8 : write a function that inputs 'weather' and for each column, outputs
    #     the frequency of each word in that column. Hint: if you are processing the column
    #     'col' and you find the 'word' then if this is the first time you've
    #     seen it, you should store the 'word' on top of a stack of 'Items' for that
    #     column. Sounds complex, right? Well, it's just one line:
    #
    #     if (++Count[col,word] == 1) { Items[col, ++Items[col,0]] = word}
    weather | uniquesWorker
}
# median: copy comma-separated stdin to stdout, then append a ruler line and
# a "#median" line giving the middle value of every column except the first.
# Lines containing "#" are echoed but not included in the statistics. For an
# even number of rows the lower of the two middle values is reported. The
# number of columns is taken from the last line read.
median() {
    #Q9: using gawk functions, remove all globals from this code
    #    except FS, NF, Data
    #Q10: there should be no need for the "delete Val" line in the
    #     following code. get rid of it. Hint:
    #     add a function with "val" as a local variable that is called

    gawk '
    BEGIN { FS = "," }
          { print }
    /#/   { next }
          { for (I = 2; I <= NF; I++) {
                Data[I,0]++
                Data[I,Data[I,0]] = $I }
          }
    END   { printf("#---")
            for (I = 2; I <= NF; I++) printf(",-----")
            print ""
            printf("#median")
            for (I = 2; I <= NF; I++) {
                Max = Data[I,0]
                delete Val
                N = 0
                for (J = 1; J <= Max; J++)
                    Val[J] = Data[I,J]
                asort(Val)
                # asort numbers from 1, so the middle of Max sorted values
                # is at (Max+1)/2, rounded down
                printf(",%s", Val[int((Max + 1) / 2)])
            }
            print ""
          }' -
}
#### some data
# weather: print a small ARFF data set (5 attributes, 14 instances) on stdout.
# The here-document is deliberately not indented: "<<-" strips leading tabs
# only, so space-indented text would be emitted with its indentation.
weather() {
    cat <<'EOF'
@relation weather
@attribute outlook {sunny, overcast, rainy}
@attribute temperature real
@attribute humidity real
@attribute windy {TRUE, FALSE}
@attribute play {yes, no}

@data
sunny,85,85,FALSE,no
sunny,80,90,TRUE,no
overcast,83,86,FALSE,yes
rainy,70,96,FALSE,yes
rainy,68,80,FALSE,yes
rainy,65,70,TRUE,no
overcast,64,65,TRUE,yes
sunny,72,95,FALSE,no
sunny,69,70,FALSE,yes
rainy,75,80,FALSE,yes
sunny,75,70,TRUE,yes
overcast,72,90,TRUE,yes
overcast,81,75,FALSE,yes
rainy,71,91,TRUE,no
EOF
}
# demo4data: print 25 one-word lines (numbers and words) used by the demos.
# The here-document is deliberately not indented: "<<-" strips leading tabs
# only, so a terminator indented with spaces would never end the document.
demo4data() {
    cat <<'EOF'
10
apple
apple
apple
cat
4
oranges
oranges
oranges
oranges
oranges
oranges
oranges
oranges
oranges
oranges
oranges
oranges
oranges
oranges
oranges
oranges
1
2
1
EOF
}
# demo14data: print two blocks of comma-separated learner results, each
# starting with a "#file,..." header line (14 lines in total).
# The here-document is deliberately not indented: "<<-" strips leading tabs
# only, so a terminator indented with spaces would never end the document.
demo14data() {
    cat <<'EOF'
#file,x,attrs,bin,learner,a,b,c,d,acc,pd,pf,prec,g
pc2_mod,loggedDiscrete,16,9,nb,506,0,51,2,90.877,100.000,9.156,3.774,6.474
pc2_mod,loggedDiscrete,16,9,aode,556,2,1,0,99.463,0.000,0.180,0.000,70.711
pc2_mod,loggedDiscrete,16,10,j48,538,1,0,0,99.814,0.000,0.000,0.000,70.711
pc2_mod,loggedDiscrete,16,10,jrip,538,1,0,0,99.814,0.000,0.000,0.000,70.711
pc2_mod,loggedDiscrete,16,10,oner,538,1,0,0,99.814,0.000,0.000,0.000,70.711
pc2_mod,loggedDiscrete,16,10,nb,501,1,37,0,92.950,0.000,6.877,0.000,70.878
pc2_mod,loggedDiscrete,16,10,aode,537,1,1,0,99.629,0.000,0.186,0.000,70.711
#file,x,attrs,bin,learner,a,b,c,d,acc,pd,pf,prec,g
pc3_mod,discrete,4,1,j48,137,19,0,0,87.821,0.000,0.000,0.000,70.711
pc3_mod,discrete,4,1,jrip,137,19,0,0,87.821,0.000,0.000,0.000,70.711
pc3_mod,discrete,4,1,oner,137,19,0,0,87.821,0.000,0.000,0.000,70.711
pc3_mod,discrete,4,1,nb,137,19,0,0,87.821,0.000,0.000,0.000,70.711
pc3_mod,discrete,4,1,aode,137,19,0,0,87.821,0.000,0.000,0.000,70.711
EOF
}
#### demos
# demo1: the smallest gawk program -- a BEGIN rule that prints a greeting
# without reading any input.
demo1() {
    gawk 'BEGIN { print "hello world" }'
}
# demo2: print "<user> have the file <name>" for every line that `ls` prints
# for the current directory. Shows how to hand a shell variable to gawk.
# Globals read: USER
demo2() {
    ls | gawk -v You="$USER" '{ print You " have the file " $0 }'
}
# demo3: read stdin to the end and then greet whatever was on the last line.
demo3() {
    #Q10;  is this script broken? when i call it from the command
    #      line using 'demo3', it just hangs. What is going on?

    gawk '    { You = $0 }
          END { print "hello " You }'
}
# demo4worker: for every line of stdin, report whether its first
# comma-separated field matches the regular expression held in Number.
# Output lines look like "[10] is a number" or "[apple] is not a number".
demo4worker() {
    #Q11;  write down and explain each part of the following regular expression
    #      shown in 'Number'

    gawk '
    BEGIN { FS = ","
            Number = "^[+-]?([0-9]+[.]?[0-9]*|[.][0-9]+)([eE][+-]?[0-9]+)?$"
          }
          { Stats = $1 ~ Number ? "is" : "is not"
            print "[" $1 "] " Stats " a number" }'
}
# demo4: run demo4worker over the demo4data sample.
demo4() {
    demo4data | demo4worker
}
# demo5: count how often each word occurs in demo4data and print
# "count,word" lines sorted by ascending count.
demo5() {
    demo4data |
    gawk '
    BEGIN { OFS = "," }
          { for (I = 1; I <= NF; I++) Freq[$I]++ }
    END   { for (N in Freq) print Freq[N], N }' |
    sort -t, -n -k 1,1
}
# demo6: like demo5, but each output line also carries the word's share of
# all words as a whole-number percentage and a bar of "*" characters (one
# fewer than the percentage), under the header "#n,item,percent,graph".
demo6() {
    demo4data |
    gawk '
    BEGIN { OFS = "," }
          { for (I = 1; I <= NF; I++) {
                All++
                Freq[$I]++ }
          }
    END   { print "#n,item,percent,graph"
            for (N in Freq) {
                Percent = int(100 * Freq[N] / All)
                print Freq[N], N, Percent, str(Percent) }
          }
    function str(n, out) {
        while (--n > 0) out = out "*"
        return out
    }' |
    sort -t, -n -k 1,1
}
# Exercise: modify demo9 so that its output is left-justified when the first line contains "<".
# demo7: a stack built on a gawk array whose element 0 holds the height.
# Pushes 1, 2 and 3, then pops and prints until pop returns an empty value,
# so the output is 3, 2, 1 (one per line).
demo7() {
    gawk '
    function top(a)        { return a[a[0]] }
    function push(a,x,  i) { i = ++a[0]; a[i] = x; return i }
    function pop(a,   x,i) {
        i = a[0]--
        if (!i) { return "" } else { x = a[i]; delete a[i]; return x }}

    BEGIN { push(a,1); push(a,2); push(a,3)
            while (x = pop(a)) print x
          }'
}
# demo8: shuffle demo4data twice, printing a ruler line before each run.
demo8() {
    echo "---------------------"
    demo4data | shuffle
    echo "---------------------"
    demo4data | shuffle
}
# demo9: pipe the output of demo6 through malign.
demo9() {
    #Q12: what is different about demo9 to demo5. explain
    #     in detail how that difference is implemented.
    demo6 | malign
}
# demo10: pipe the weather data through classes and then through malign.
demo10() {
    weather | classes | malign
}
# demo11: print "Instances=" followed by the output of instances run on the
# weather data set.
demo11() {
    printf 'Instances='
    weather | instances
}
# demo12: print "Numerics=" followed by the output of numerics run on the
# weather data set.
demo12() {
    #Q13: the 'instances' script of demo11 runs thru the whole file
    #     but the 'numerics' script of demo12 stops after '@data'. why?

    printf 'Numerics='
    weather | numerics
}
# demo13: split the weather data into 4 bins inside $Tmp, then print
# train.arff, a blank line and test.arff (bin 1 is the test set).
# Globals read: Tmp, Here. The shell is left in $Here afterwards.
# Returns: 1 without touching any file when $Tmp is unset or cannot be entered.
demo13() {
    #Q14: why does this script change to $Tmp?
    local Bins=4
    # A bare "cd" with an empty Tmp would silently go to $HOME and the files
    # would be written there, so refuse to continue instead.
    if [[ -z "${Tmp:-}" ]] || ! cd "$Tmp"; then
        blabln "demo13: cannot cd to Tmp='${Tmp:-}' (run setUp first)"
        return 1
    fi
    weather | someArff --bins "$Bins" --bin 1
    cat train.arff
    echo ""
    cat test.arff
    cd "$Here"
}
# demo14: a 2-bin split of the weather data. For each bin in turn, run
# someArff inside $Tmp with that bin as the test set and print a banner,
# train.arff, a blank line and test.arff. The same seed is used for both bins.
# Globals read: Tmp, Here. The shell is left in $Here afterwards.
# Returns: 1 without touching any file when $Tmp is unset or cannot be entered.
demo14() {
    local Seed=$RANDOM
    local Bins=2
    local Bin
    if [[ -z "${Tmp:-}" ]] || ! cd "$Tmp"; then
        blabln "demo14: cannot cd to Tmp='${Tmp:-}' (run setUp first)"
        return 1
    fi
    for ((Bin = 1; Bin <= Bins; Bin++)); do
        printf '\n---| %s |-------------------------\n\n' "$Bin"
        weather | someArff --bins "$Bins" --bin "$Bin" --seed "$Seed"
        cat train.arff
        echo ""
        cat test.arff
    done
    cd "$Here"
}
# demo15: pipe demo14data through median and then through malign.
demo15() {
    #Q15: what is going on here? what does 'median' do?
    #     why is its output going to 'malign'?

    demo14data | median | malign
}
# demo16: the demo6 program run under the auditing command held in $Audit
# (plain gawk when Audit is unset). The gawk program is left as it is on
# purpose: finding its flaw is the exercise described in Q16.
# Globals read: Audit -- expanded unquoted on purpose so that it word-splits
#               into a command name plus its options.
demo16() {
    # Q16: there is an 'escaped' local variable in this script.
    #      (i.e. something that should have been declared local
    #      but the silly programmer forgot to do that). Run
    #     the script and use the output logs generated by 'Audit'
    #     to find that sucker. Fix the program (declare one more local).

    demo4data |
    ${Audit:-gawk} --source '
     BEGIN{ OFS=","}
          { for(I=1;I <=NF;I++) {
              All++;
              Freq[$I]++ }
     }
     END { print "#n,item,percent,graph"
           for(N in Freq) {
              Percent = int(100*Freq[N]/All)
               print Freq[N],  N, Percent, str(Percent) }
     }
     function str(n) {
       while(--n > 0) out = out "*";
       return out
     }' |
    sort -t, -n -k 1,1
}
# demo17: placeholder for the answer to Q17; currently does nothing.
demo17() {
    # Q17: adapt 'someArff' to 'firstArff'. This generates a train
    #      and test set where train includes all of bins {1,2 .. (bin-1)}
    #      and the test set contains bin {i}.
    true
}
# demo18: placeholder for the answer to Q18; currently does nothing.
demo18() {
    # Q18: adapt 'someArff' to 'nArff' that accepts a command line
    #      argument "N" that returns an arff file containing up to
    #      the first N instances.
    true
}
#### start up
# Runs when this file is loaded: build the environment, then print the
# banner on stderr and the scratch directory on stdout.
setUp
blabln "OurGawk version v0.1 (c)2007 tim@menzies.us under GPLv3"
printf 'Tmp=%s\n' "$Tmp"
blabln "If you can get your job done with grep don't use SED."
blabln "If you can do it with SED don't use AWK."
blabln "If you can do it with AWK don't use 'C'.\n"