# mdpModule2CSV: convert a comma-separated module-metrics table into a CSV
# whose last column is the class attribute.
#
# example usage:
#   MDPdata="x_product_module_metrics.csv"
#   mdpModule2CSV Log="10,20" Ignore="1,16,31" $MDPdata > x.csv
#   java -cp weka.jar weka.core.converters.CSVLoader x.csv > x.arff
#
# The values assigned in BEGIN are defaults.  "Var=value" operands given on
# the command line are applied after BEGIN has run and before the first
# record is read, so they override these defaults.  That is why the
# column lists are turned into lookup arrays at NR==1 and not in BEGIN.

BEGIN {
    # Report class as Boolean ("yes"/"no") instead of the raw class value?
    Boolean = 1;
    # Minimum value for log numbers (smaller inputs are clamped to this)
    LogMin = 0.000001;
    # Split fields on commas, swallowing blanks/tabs around each comma
    FS = "[ \t]*,[ \t]*";      # <== REGULAR EXPRESSION
    # What is the target column?
    Class = 15;                # <== NUMBER
    # What columns NOT to output
    Ignore = "1,16,31";        # <== COMMA-SEPARATED NUMBERS
    # Which columns to log?
    Log = "";                  # <== COMMA-SEPARATED NUMBERS
    # Prefix of output file for columns
    # (not referenced by any rule visible in this script)
    Stem = "";                 # <== String
}

# s2b(s, a): convert the comma-separated list s into the set a, so that
# (k in a) is true for every entry k of s.
#   s - string such as "1,16,31"; blanks/tabs around entries are ignored
#   a - array, emptied first and then filled with a[entry] = 1
# Returns the number of comma-separated pieces found in s (0 for "").
function s2b(s, a,    i, tmp, n, key) {
    split("", a);              # portable idiom for emptying an array
    n = split(s, tmp, ",");
    for (i = 1; i <= n; i++) {
        key = tmp[i];
        # Trim so that Ignore="1, 16" still matches column 16.
        gsub(/^[ \t]+|[ \t]+$/, "", key);
        if (key != "")
            a[key] = 1;
    }
    return n;
}

# mylog(n): natural logarithm of n, with n clamped from below to LogMin so
# that zero or negative inputs do not reach log().
function mylog(n,    min) {
    min = LogMin;
    # Force a numeric value: a caller may pass a plain string (e.g. after a
    # leading "0" has been prepended), and comparing a string with a number
    # would otherwise be done as a string comparison.
    n += 0;
    return log(n < min ? min : n);
}

# First record: build the column sets from the (possibly overridden) lists.
NR == 1 {
    s2b(Ignore, IgnoreS);
    s2b(Log, LogS);
}

# Every record: drop a trailing comma (and any blanks/tabs after it) so it
# does not produce an extra empty field.
{ sub(/,[ \t]*$/, "") }

# Header record: emit the kept column names, then the class column name.
NR == 1 {
    Str = Sep = "";
    for (I = 1; I <= NF; I++) {
        if (I in IgnoreS) continue;
        if (Class == I) continue;
        Val = $I;
        if (Val ~ /^[ \t]*$/)
            Val = "?";                 # unnamed column
        if (I in LogS)
            Val = "L=" Val "=";        # mark columns that will be logged
        Str = Str Sep Val;
        Sep = ",";
        Name[I] = Val;
    }
    # Concatenate: "print Str, x" would insert OFS (a space) before ",class".
    print Str ",class";
    next;
}

# Skip empty data records; they would otherwise yield a row with no values.
/^[ \t]*$/ { next }

# Data records: emit the kept columns, then the class value last.
{
    Str = Sep = "";
    for (I = 1; I <= NF; I++) {
        if (I in IgnoreS) continue;
        if (Class == I) continue;
        Val = $I;
        if (Val ~ /^[ \t]*$/)
            Val = "?";                 # missing value marker
        if (Val != "?") {
            # Give ".5" / "-.5" / "+.5" an explicit leading zero.
            if (Val ~ /^[-+]?\./)
                sub(/\./, "0.", Val);
            if (I in LogS)
                Val = mylog(Val);
        }
        Str = Str Sep Val;
        Sep = ",";
    }
    End = $Class;
    # NOTE(review): the Boolean form assumes the class column is numeric
    # (e.g. a count); confirm against the input data.
    if (Boolean)
        End = ($Class > 0) ? "yes" : "no";   # <= add class symbol
    print Str "," End;
}