#!/bin/bash
#
# Builds a pairwise row-overlap matrix for each group of CSV files.
#
# For every dataset prefix, all files named PREFIX*.csv are read, each row is
# labelled with the stem of the file it came from, and a comma-separated matrix
# is produced whose upper-triangle cells hold "shared/union" for each pair of
# stems. The matrix is piped through `malign` and written to PREFIXMatrix.
#
# NOTE(review): `malign` is not defined in this file; it is assumed to be an
# external command available on PATH -- confirm.

# gather PREFIX
#   Writes every line of each file matching PREFIX*.csv to stdout, followed by
#   a tab and the file stem (the file name up to, not including, its first ".").
gather() {
    local data stem
    for data in "$1"*.csv; do
        # An unmatched glob is left as the literal pattern; skip it instead of
        # handing gawk a file that does not exist.
        [[ -e $data ]] || continue

        # Strip everything from the first "." onwards.
        stem=${data%%.*}

        gawk -v Data="$stem" 'BEGIN { FS = ","; OFS = "\t" } { print $0, Data }' "$data"
    done
}

# matrix [FILE]
#   Reads "row<TAB>label" lines from FILE (or stdin when FILE is omitted) and
#   prints a CSV matrix with one row and one column per label, labels sorted
#   with asort. Cells above the diagonal contain "shared/union", where shared
#   is the number of (row, row) pairs that compare equal between the two
#   labels and union is rows(label1) + rows(label2) - shared. The diagonal and
#   the cells below it are left empty.
matrix() {
    gawk '
    BEGIN { FS = "\t"; OFS = "," }

    # Data[label, 0] is the number of rows seen for a label;
    # Data[label, n] is the n-th row stored for that label.
    {
        Data[$2, ++Data[$2, 0]] = $1
        Subset[$2] = 1
    }

    END {
        # Collect the labels and put them into a stable, sorted order.
        for (key in Subset)
            SubsetOrder[++keyCount] = key
        asort(SubsetOrder)

        # Count equal rows for every unordered pair of labels. Starting k2 at
        # k1 + 1 visits each pair exactly once, so no "seen" table is needed.
        for (k1 = 1; k1 <= keyCount; k1++) {
            key1 = SubsetOrder[k1]
            for (k2 = k1 + 1; k2 <= keyCount; k2++) {
                key2 = SubsetOrder[k2]
                for (i = 1; i <= Data[key1, 0]; i++) {
                    for (j = 1; j <= Data[key2, 0]; j++) {
                        if (Data[key1, i] == Data[key2, j])
                            Shared[key1, key2]++
                    }
                }
            }
        }

        # Header row: an empty corner cell, then one column per label.
        for (k1 = 1; k1 <= keyCount; k1++)
            printf(",%s", SubsetOrder[k1])
        print ""

        # Body: only the upper triangle (k2 > k1) carries a value; every
        # other cell is emitted empty.
        for (k1 = 1; k1 <= keyCount; k1++) {
            key1 = SubsetOrder[k1]
            printf("%s,", key1)
            for (k2 = 1; k2 <= keyCount; k2++) {
                key2 = SubsetOrder[k2]
                if (k2 > k1)
                    printf("%s/%s,", Shared[key1, key2] + 0, Data[key1, 0] + Data[key2, 0] - Shared[key1, key2])
                else
                    printf(",")
            }
            printf("\n")
        }
    }
    ' ${1:+"$1"}   # pass FILE only when given, so gawk falls back to stdin
}

# Build one matrix file per dataset prefix.
for dataset in coc81 nasa93; do
    gather "$dataset" | matrix | malign > "${dataset}Matrix"
done