#!/usr/bin/gawk -f
# /* vim: set filetype=awk : */ -*- awk -*-
#
# Splits an ARFF file into three files named after the global Stem:
#   Stem ".names" - the class values, a blank line, then one
#                   "name: continuous" or "name: v1,v2,..." line per attribute
#   Stem ".cfg"   - the fixed configuration text held in Config
#   Stem ".data"  - every non-blank line that follows the @data keyword
#
# Stem defaults to "data".  The output names are derived when the first
# record is read (i.e. after BEGIN), so an awk command-line assignment such
# as Stem=weather placed before the input file takes effect.
#
# IGNORECASE is gawk-specific; it makes every keyword and the "class" test
# case-insensitive.

### helpers

# Derives the three output file names from the current value of Stem.
function setOutputNames() {
    Cfg     = Stem ".cfg"
    DataOut = Stem ".data"
    Names   = Stem ".names"
}

# Returns the comma-separated values found between the first "{" and the
# first "}" of line, with whitespace around the commas and at both ends
# removed.  Returns "" when the line has no "{...}" pair.
function nominalValues(line,    first, last, vals) {
    first = index(line, "{")
    last  = index(line, "}")
    if (first == 0 || last < first)
        return ""
    vals = substr(line, first + 1, last - first - 1)
    gsub(/[ \t]*,[ \t]*/, ",", vals)
    gsub(/^[ \t]+|[ \t]+$/, "", vals)
    return vals
}

# True when the type part of an @attribute line (everything after the
# attribute name) is numeric/real/integer.  Only the type part is inspected,
# and a "{...}" value list always means nominal, so an attribute whose name
# or values merely contain one of those words is not mistaken for a
# continuous one.
function isContinuous(line,    decl) {
    decl = line
    sub(/^[ \t]*@attribute[ \t]+[^ \t]+[ \t]*/, "", decl)
    return index(decl, "{") == 0 && decl ~ /numeric|real|integer/
}

### patterns0: initialization stuff
BEGIN {
    Stem = "data"
    IGNORECASE = 1
    Atts = ""
    Class = ""
    Dat = ""
    Data = 0    # becomes 1 once the @data keyword has been seen
    # The following is the configuration used for tar3
    Config = "granularity: 5\n" \
             "maxNumber: 10\n" \
             "maxSize: 4\n" \
             "randomTrials: 50\n" \
             "futileTrials: 5\n" \
             "bestClass: 33%"
}

NR == 1 { setOutputNames() }

### patterns1: skip blanks and comments
{ sub(/%.*/, "") }
/^[ \t]*$/ { next }

### patterns2: react to arff keywords
# Keywords are anchored to the start of the line so that ordinary text
# containing "@data" or "@relation" cannot flip the header/data state.
/^[ \t]*@relation/ { Data = 0; next }

# "next" keeps the @data line itself out of the data file, whatever trailing
# characters (spaces, a carriage return) it carries.
/^[ \t]*@data/ { Data = 1; next }

### pattern3: collect the attribute declarations (header section only)
# The attribute name is taken from the second whitespace-separated field.
!Data && /^[ \t]*@attribute/ {
    if (isContinuous($0)) {
        # continuous attributes go onto the attribute list as-is
        Atts = Atts "\n" $2 ": continuous"
    } else if ($2 !~ /class/) {
        # discrete attributes are listed with their cleaned-up values
        Atts = Atts "\n" $2 ": " nominalValues($0)
    }
    if ($2 ~ /class/) {
        # an attribute whose name contains "class" supplies the class values
        Class = nominalValues($0)
    }
    next
}

### pattern4: appending the data lines to the output
Data { Dat = Dat $0 "\n" }

### pattern5: outputting the data into the appropriate files
END {
    # With no input records the NR == 1 rule never ran; redirecting to an
    # unset (empty) file name is a fatal error in gawk.
    if (Names == "")
        setOutputNames()
    print Class "\n" Atts > Names
    print Config > Cfg
    # Dat already ends with a newline; printf avoids adding an empty line.
    printf "%s", Dat > DataOut
}