#!/usr/bin/gawk -f
#
# Split ARFF data into one test file and two training files.
#
# Inputs (positional, fixed by the script):
#   ARGV[3]  "subset" ARFF file   - supplies the header, the test instances
#                                   and the subset-only training instances.
#   ARGV[4]  "superset" ARFF file - supplies the training instances for the
#                                   superset training file (its header is
#                                   discarded).
#   ARGV[1] and ARGV[2] are not inspected by the script.
#   NOTE(review): they are presumably command-line assignments such as
#   `Seed=3 TestSize=10` -- confirm against how the script is invoked.
#
# Variables (defaults set in BEGIN, overridable by assignment operands that
# appear before the input files):
#   Seed           RNG seed; a false value (0 / empty) falls back to 1.
#   TestSize       number of subset instances written to the test file.
#   TrainSubset    output: header + subset instances not used for testing.
#   TrainSuperset  output: header + superset instances that are not
#                  textually identical to any test instance.
#   TestSubset     output: header + the test instances.
#
# Blank lines are ignored everywhere. Lines outside the @relation..@data
# header range (including any '%' comment lines) are treated as instances.

BEGIN {
    TrainSubset   = "trainSubset.arff";
    TrainSuperset = "trainSuperset.arff";
    TestSubset    = "test.arff";
    TestSize      = 5;
    Seed          = 1;

    # The rules below only recognise files at ARGV[3] and ARGV[4]; with
    # fewer operands nothing would ever match, so fail loudly instead of
    # silently producing empty output files.
    if (ARGC < 5) {
        print "error: expected the subset ARFF file as operand 3 and the " \
              "superset ARFF file as operand 4 " \
              "(e.g. Seed=1 TestSize=5 subset.arff superset.arff)" > "/dev/stderr";
        BadUsage = 1;
        exit 1;
    }
}

# True when `line` is an ARFF @relation declaration. ARFF keywords are
# case-insensitive, so the comparison is done on a lower-cased copy.
function startsHeader(line) {
    return tolower(line) ~ /^[ \t]*@relation/;
}

# True when `line` is the ARFF @data declaration that ends the header.
function endsHeader(line) {
    return tolower(line) ~ /^[ \t]*@data/;
}

# Fisher-Yates shuffle of arr[1..n] in place, driven by rand().
function shuffle(arr, n,    i, j, tmp) {
    for (i = n; i > 1; i--) {
        j = int(rand() * i) + 1;    # rand() is in [0,1), so j is in 1..i
        tmp = arr[i];
        arr[i] = arr[j];
        arr[j] = tmp;
    }
}

# One-time initialisation on the very first input record. This rule must
# come before the blank-line rule, otherwise a blank first line would skip
# it. It lives here rather than in BEGIN because command-line assignment
# operands (Seed=..., output names) are only applied after BEGIN has run.
NR == 1 {
    srand(Seed ? Seed : 1);
    # Truncate (or create) the outputs so later appends start from empty.
    printf "" > TrainSubset;
    printf "" > TrainSuperset;
    printf "" > TestSubset;
}

# Ignore blank / whitespace-only lines.
/^[ \t]*$/ { next; }

# Header of the subset file: copy it verbatim to all three outputs.
FILENAME == ARGV[3] && startsHeader($0), endsHeader($0) {
    print $0 >> TrainSubset;
    print $0 >> TrainSuperset;
    print $0 >> TestSubset;
    next;
}

# Header of the superset file: drop it (the subset header is used instead).
FILENAME == ARGV[4] && startsHeader($0), endsHeader($0) { next; }

# Everything else is an instance. Store under sequential integer indices:
# keying by rand() would convert the number through CONVFMT ("%.6g") and
# let two draws collide, silently overwriting an instance.
FILENAME == ARGV[3] { InstanceSubset[++SubsetCount] = $0; }
FILENAME == ARGV[4] { InstanceSuperset[++SupersetCount] = $0; }

END {
    if (BadUsage)
        exit 1;

    # Randomise both instance lists; the order is determined by Seed.
    shuffle(InstanceSubset, SubsetCount);
    shuffle(InstanceSuperset, SupersetCount);

    for (i = 1; i <= SubsetCount; i++) {
        if (TestSize > 0) {
            # The first TestSize shuffled instances form the test set;
            # remember their text so they can be excluded below.
            InTest[InstanceSubset[i]] = 1;
            print InstanceSubset[i] >> TestSubset;
            TestSize--;
        } else {
            # The remainder becomes the subset-only training file.
            print InstanceSubset[i] >> TrainSubset;
        }
    }

    # A superset instance may be used for training only if its text does
    # not also appear in the test file.
    for (i = 1; i <= SupersetCount; i++) {
        if (!(InstanceSuperset[i] in InTest))
            print InstanceSuperset[i] >> TrainSuperset;
    }
}