# example usage:
# MDPdata="x_product_module_metrics.csv"
# mdpModule2CSV  Log="10,20" Ignore="1,16,31" $MDPdata > x.csv
# java -cp weka.jar weka.core.converters.CSVLoader x.csv > x.arff
 
BEGIN {
  # Defaults. The usage example at the top of the file overrides some of
  # these with var=value arguments on the command line.

  # Field separator (REGULAR EXPRESSION): a comma and any blanks/tabs around it.
  FS      = "[ \t]*,[ \t]*";
  # Index of the column holding the target class (NUMBER).
  Class   = 15;
  # Columns that are dropped from the output (COMMA-SEPARATED NUMBERS).
  Ignore  = "1,16,31";
  # Columns whose values are log-transformed (COMMA-SEPARATED NUMBERS).
  Log     = "";
  # Floor applied by mylog() before taking the logarithm.
  LogMin  = 0.000001;
  # Non-zero: report the class as "yes"/"no" instead of its raw value.
  Boolean = 1;
  # Prefix of output file for columns (STRING); not read in the visible code.
  Stem    = "";
}
# On the first record, expand the comma-separated column lists into lookup
# arrays indexed by column number. This is done here, not in BEGIN, so the
# current values of Log and Ignore are the ones that get expanded.
NR == 1 {
  s2b(Log, LogS);
  s2b(Ignore, IgnoreS);
}
# s2b: "string to booleans".
#   s : comma-separated list of tokens (column numbers in this script)
#   a : array to fill; it is emptied first, then a[token] = 1 for each token
# Returns the number of comma-separated items found in s (0 for "").
# Blanks and tabs around each token are stripped so that "1, 16" marks
# column 16 (the field separator is equally tolerant of blanks around
# commas); tokens that are empty after stripping are not recorded.
# i, tmp, n and key are locals.
function s2b(s,a,    i,tmp,n,key) {
  split("",a,"");            # portable idiom for clearing an array
  n=split(s,tmp,",");
  for(i=1;i<=n;i++) {
    key = tmp[i];
    gsub(/^[ \t]+|[ \t]+$/, "", key);
    if (key != "") a[key]=1;
  }
  return n;
}
# mylog: natural logarithm of n, with n first raised to LogMin if it is
# smaller, so zero or negative inputs never reach log().
#   n : value to transform; numeric-looking strings are accepted
# Both operands are forced to numbers before comparing. Without that, a
# plain string such as "0.5" (built by prepending "0" to ".5") is compared
# to LogMin as a string, and "0.5" < "1e-06" is true, which wrongly
# returned log(LogMin).
# min is a local; tmp is unused and kept only so the signature is unchanged.
function mylog(n,   min,tmp) {
  min = LogMin + 0;
  n   = n + 0;
  return log(n < min ? min : n);
}

      { sub(/,[ \t]*$/, "", $0) } # every record: drop one trailing comma (and blanks after it)
# Handle line one: print the header row.
# Ignored columns and the class column are skipped, a blank name becomes
# "?", and the name of a logged column is wrapped as "L=<name>=". The
# class column is always emitted last, under the name "class".
NR==1 {  
  Str=Sep="";
  for(I=1;I<=NF;I++) {
    if (IgnoreS[I]) continue;
    if (Class == I) continue;
    Val = $I;
    Val = (Val ~ /^[ \t]*$/) ? "?" : Val;
    Val = LogS[I] ? "L=" Val "="   : Val;
    Str = Str Sep Val;
    Sep = ",";
    Name[I] = Val;   # output name per column; not read in the visible code
  }
  # Concatenate instead of using print's comma: the comma inserts OFS (a
  # space by default) and would leave a stray blank before ",class".
  print Str ",class"
  next
}

# Handle remaining lines: print one data row per record.
# Kept columns come first (same skipping rules as the header row), then
# the class value from column Class.
{
  Str = "";
  Sep = "";
  for (I = 1; I <= NF; I++) {
    if (IgnoreS[I] || Class == I) continue;
    Val = $I;
    # ".5" style values get a leading zero.
    if (Val ~ /^\./) Val = "0" Val;
    # Blank cells are written as the missing-value marker "?".
    if (Val ~ /^[ \t]*$/) Val = "?";
    # Missing values are never log-transformed.
    if (Val != "?" && LogS[I]) Val = mylog(Val);
    Str = Str Sep Val;
    Sep = ",";
  }
  # Class symbol: "yes"/"no" when Boolean is set, otherwise the raw value.
  if (Boolean)
    End = $Class > 0 ? "yes" : "no"
  else
    End = $Class
  print Str ","  End
}