#!/usr/bin/gawk -f

# Default settings. BEGIN runs before awk looks at any command-line operand,
# so a later var=value operand can replace any of these values.
# NOTE(review): the rules below read the data files from ARGV[3] and ARGV[4],
# so ARGV[1] and ARGV[2] are presumably such overrides -- confirm with callers.
BEGIN {
	Seed          = 1                      # seed handed to srand()
	TestSize      = 5                      # number of instances sent to the test file
	TestSubset    = "test.arff"            # output: held-out test instances
	TrainSubset   = "trainSubset.arff"     # output: remaining instances of the first file
	TrainSuperset = "trainSuperset.arff"   # output: second file minus the test instances
}

# Drop records that are empty or hold only spaces/tabs; no later rule sees them.
$0 ~ /^[ \t]*$/ { next }

# One-time setup: seed the random generator and truncate the three output
# files so the ">>" appends elsewhere in the script start from empty files.
#
# This is keyed on a run-once flag instead of NR==1: the blank-line rule above
# issues `next`, so when the very first input record is blank an NR==1 rule
# would never fire, leaving the generator unseeded and stale output from a
# previous run in place.  With the flag, setup runs on the first record that
# actually reaches this rule.
#
# It deliberately stays in a main rule (not BEGIN): var=value operands on the
# command line are only applied once awk starts processing operands, so a
# Seed or file name supplied that way is visible here but not in BEGIN.
!SplitInitialized {
	SplitInitialized = 1

	# An empty or zero Seed falls back to 1.
	srand(Seed ? Seed : 1)

	# Printing an empty string with ">" creates/truncates the file.
	printf "" > TrainSubset
	printf "" > TrainSuperset
	printf "" > TestSubset
}

# Header handling.  A header is the range of lines from the one containing
# "@relation" through the one containing "@data".  ARFF declarations are
# case-insensitive, so both markers are matched against a lowercased copy of
# the record; the record itself is written out unchanged.

# Header of the first data file (ARGV[3]): copy it verbatim to all three
# output files, then stop processing the record so it is not stored as an
# instance.
FILENAME == ARGV[3] && tolower($0) ~ /@relation/, tolower($0) ~ /@data/ {
	print $0 >> TrainSubset
	print $0 >> TrainSuperset
	print $0 >> TestSubset
	next
}

# Header of the second data file (ARGV[4]): discard it; the outputs already
# carry the header taken from the first file.
FILENAME == ARGV[4] && tolower($0) ~ /@relation/, tolower($0) ~ /@data/ {
	next
}

# fresh_key(pool): return a random subscript that is not yet used in `pool`.
#
# A number used as an array subscript is converted to a string with CONVFMT
# ("%.6g" by default), so rand() yields only about a million distinct keys.
# Without a check, two records that draw the same key silently overwrite each
# other and an instance is lost.  The first draw is kept in the same CONVFMT
# form a bare rand() subscript would have, so nothing changes when there is
# no collision; on a collision, keys are redrawn at full precision until an
# unused one is found.
#   pool - array the key will be used in
#   key  - local scratch variable (not an argument)
function fresh_key(pool,    key) {
	key = rand() ""                     # concatenation applies CONVFMT, like a subscript
	while (key in pool)
		key = sprintf("%.17g", rand())
	return key
}

# Every remaining record of the first data file (ARGV[3]) is stored under a
# random key; the END rule later walks the array with for-in.
FILENAME == ARGV[3] {
	InstanceSubset[fresh_key(InstanceSubset)] = $0
}

# Same for the second data file (ARGV[4]).
FILENAME == ARGV[4] {
	InstanceSuperset[fresh_key(InstanceSuperset)] = $0
}

# Write out the collected instances.
#   1. Walk InstanceSubset: while TestSize is still positive, an instance goes
#      to the test file (and is remembered in TestArray); every later one goes
#      to the subset training file.
#   2. Walk InstanceSuperset: an instance goes to the superset training file
#      unless it compares equal to one of the remembered test instances.
END {
	for (key in InstanceSubset) {
		row = InstanceSubset[key]

		# Test quota exhausted: this instance is training data.
		if (!(TestSize > 0)) {
			print row >> TrainSubset
			continue
		}

		# Still filling the test set: remember the instance and emit it.
		TestArray[TestSize] = row
		print row >> TestSubset
		TestSize--
	}

	for (key in InstanceSuperset) {
		candidate = InstanceSuperset[key]

		# Compare against every held-out test instance.
		heldOut = 0
		for (slot in TestArray)
			if (candidate == TestArray[slot])
				heldOut = 1

		# Only instances absent from the test file may be used for training.
		if (!heldOut)
			print candidate >> TrainSuperset
	}
}