#!/usr/bin/gawk -f #Min and Max are only used when a histogram or graph is going to be generated. They are only for explanation purposes and have no other use. #Given dist (and its flag) is meaningless for internal (non-leaf) nodes. Their distributions must be generated from their child nodes' distsributions. #Goal dist (and its flags) is meaningless for leaf nodes. BEGIN { FS = OFS = ","; IGNORECASE = 1; Seed = 1; TotalBins = 20; TotalRuns = 100; Factor = 1; # Percentage of nodes to obtain random distributions IsExperimentalData = 1; } /^$/ || /\#.*/ { next; } { UniqueKeyIndex = 1; # This is an unique key for each node used to connect nodes in the Bayesian Network data file to nodes in the graphical data file. NameIndex = 2; OperationIndex = 3; GoalPriorityIndex = 4; MinIndex = 5; MaxIndex = 6; GivenDistFlagIndex = 7; IsGivenDistIndex = 8; IsGoalDistIndex = 9; ChildrenCountIndex = 10; SanityCheckUniqueNodeName[$NameIndex]++; TotalNodes++; Index[$NameIndex] = TotalNodes; UniqueKey[TotalNodes] = $UniqueKeyIndex; Name[TotalNodes] = $NameIndex; Operation[TotalNodes] = $OperationIndex; GoalPriority[TotalNodes] = $GoalPriorityIndex; Min[TotalNodes] = $MinIndex; Max[TotalNodes] = $MaxIndex; GivenDistFlag[TotalNodes] = $GivenDistFlagIndex; IsGivenDist[TotalNodes] = $IsGivenDistIndex; IsGoalDist[TotalNodes] = $IsGoalDistIndex; ChildrenCount[TotalNodes] = $ChildrenCountIndex; for (i = 1; i <= ChildrenCount[TotalNodes]; i++) Children[TotalNodes,i] = $(i+ChildrenCountIndex); GivenDistStartIndex = ChildrenCountIndex + ChildrenCount[TotalNodes]; if (IsGivenDist[TotalNodes] == 1) { for (i = 1; i <= TotalBins; i++) GivenDist[TotalNodes,i] = $(GivenDistStartIndex+i); GoalDistStartIndex = GivenDistStartIndex + TotalBins; } else GoalDistStartIndex = GivenDistStartIndex; #keep the goal dist normalized. This is not needed for given dist since it is normalized as needed. if (IsGoalDist[TotalNodes] == 1) { for (i = 1; i <= TotalBins; i++) tempDistArray[i] = $(GoalDistStartIndex+i); normalizeDist(tempDistArray); for (i = 1; i <= TotalBins; i++) GoalDist[TotalNodes,i] = tempDistArray[i]; } } END { srand(Seed); #this is where the code starts normally if (sanityCheck() == -1) { print "Error! One or more of the child nodes are missing OR a node is defined recursively OR a node is defined more than once."; exit; } infinity = 10^20; AllRoots = 0; findSubgraphs(); #NOTE: This is only needed in experimental settings and should not be used when user data is the input. if (IsExperimentalData == 1) generateRandomGoalAndGivenDist(int(TotalNodes*Factor/100)); #sort the roots based on their priority for (rootCounter = 1; rootCounter <= AllRoots; rootCounter++) { rootPriority[rootCounter,1] = rootCounter; rootPriority[rootCounter,2] = GoalPriority[SubGraph[rootCounter,0]]; } for (i = 1; i <= AllRoots; i++) { tempRootIndex = rootPriority[i,1]; tempPriority = rootPriority[i,2]; j = i; while ((j > 1) && (rootPriority[j-1,2] < tempPriority)) { rootPriority[j,1] = rootPriority[j-1,1]; rootPriority[j,2] = rootPriority[j-1,2]; j = j - 1; } rootPriority[j,1] = tempRootIndex; rootPriority[j,2] = tempPriority; } for (indexCounter = 1; indexCounter <= AllRoots; indexCounter++) { #use roots from highest to lowest priority rootCounter = rootPriority[indexCounter,1]; TotalSubGraphNodes = 0; for (nodeCounter = 1; nodeCounter <= TotalNodes; nodeCounter++) if (SubGraph[rootCounter,nodeCounter] == 1) TotalSubGraphNodes++; HighestPriorityNode = findMaxPriorityNode(rootCounter); if (HighestPriorityNode == -1 || GoalPriority[rootCounter] == GoalPriority[HighestPriorityNode]) HighestPriorityNode = rootCounter; for (binCounter = 1; binCounter <= TotalBins; binCounter++) { for (nodeCounter = 1; nodeCounter <= TotalNodes; nodeCounter++) { if (ProcessedFinal[nodeCounter] == 0) BestBin[nodeCounter,binCounter] = 0; MinScore[binCounter] = infinity; } } for (run = 1; run <= TotalRuns; run++) { #initialize (possibly old values) for a new run for (tempCounter = 0; tempCounter <= TotalNodes; tempCounter++) Processed[tempCounter] = 0; #process the leaves of the subgraph first for (leafCounter = 1; leafCounter <= TotalNodes; leafCounter++) { if (ChildrenCount[leafCounter] == 0 && SubGraph[rootCounter,leafCounter] == 1 && Processed[leafCounter] == 0) { if (ProcessedFinal[leafCounter] == 0) processLeafNode(leafCounter); Processed[leafCounter] = 1; Processed[0]++; } } #process the rest (all internal nodes including the root) for the subgraph while (Processed[0] < TotalSubGraphNodes) { for (internalCounter = 1; internalCounter <= TotalNodes; internalCounter++) { if (Processed[internalCounter] == 0 && SubGraph[rootCounter,internalCounter] == 1) { processedChildren = 0; for (childCounter = 1; childCounter <= ChildrenCount[internalCounter]; childCounter++) { if (Processed[Index[Children[internalCounter,childCounter]]] == 1) processedChildren++; } #if all of its children are processed, process it then if (processedChildren == ChildrenCount[internalCounter]) { if (ProcessedFinal[internalCounter] == 0) processInternalNode(internalCounter); Processed[internalCounter] = 1; Processed[0]++; } } } } score(rootCounter); } #After each root is completely processed, save the current nodes' information since the shared nodes can only be calculated once (when seen first) for (nodeCounter = 1; nodeCounter <= TotalNodes; nodeCounter++) { if (SubGraph[rootCounter,nodeCounter] == 1) { ProcessedFinal[nodeCounter] = 1; for (tempBinCounter = 1; tempBinCounter <= TotalBins; tempBinCounter++) FinalDistArray[Name[nodeCounter],tempBinCounter] = BestBin[nodeCounter,tempBinCounter]; } } #reportSubgraph(rootCounter); } #print all the results for (nodeCounter = 1; nodeCounter <= TotalNodes; nodeCounter++) { printf("%s,%s", UniqueKey[nodeCounter], Name[nodeCounter]); for (binCounter = 1; binCounter <= TotalBins; binCounter++) printf(",%.3f", BestBin[nodeCounter,binCounter]); if (IsGoalDist[nodeCounter] == 1) { for (binCounter = 1; binCounter <= TotalBins; binCounter++) printf(",%.3f", GoalDist[nodeCounter,binCounter]); } printf("\n"); } } function findMaxPriorityNode(root,tempMax,tempNode) { tempMax = 0; tempNode = -1; for (nodeCounter = 1; nodeCounter <= TotalNodes; nodeCounter++) { if (GoalPriority[nodeCounter] >= tempMax && SubGraph[root,nodeCounter] == 1) { tempMax = GoalPriority[nodeCounter]; tempNode = nodeCounter; } } return tempNode; } function findSubgraphs(tempArray) { #find all the roots as the ones that are not a child of any other node for (nodeCounter = 1; nodeCounter <= TotalNodes; nodeCounter++) for (tempCounter = 1; tempCounter <= ChildrenCount[nodeCounter]; tempCounter++) tempArray[Index[Children[nodeCounter,tempCounter]]]++; for (nodeCounter = 1; nodeCounter <= TotalNodes; nodeCounter++) { if (tempArray[nodeCounter] == 0) { #a root has been found AllRoots++; SubGraph[AllRoots,0] = nodeCounter; } } for (rootCounter = 1; rootCounter <= AllRoots; rootCounter++) { SubGraph[rootCounter,SubGraph[rootCounter,0]] = 1; findAllChildren(SubGraph[rootCounter,0],rootCounter); } } function findAllChildren(node,root,childCounter) { for (childCounter = 1; childCounter <= ChildrenCount[node]; childCounter++) { SubGraph[root,Index[Children[node,childCounter]]] = 1; findAllChildren(Index[Children[node,childCounter]],root); } } function sanityCheck() { for (nodeCounter = 1; nodeCounter <= TotalNodes; nodeCounter++) for (tempCounter = 1; tempCounter <= ChildrenCount[nodeCounter]; tempCounter++) if (Index[Children[nodeCounter,tempCounter]] == 0 || Index[Children[nodeCounter,tempCounter]] == Index[Name[nodeCounter]]) return -1; for (element in SanityCheckUniqueNodeName) if (SanityCheckUniqueNodeName[element] > 1) return -1; return 1; } function score(root,tempCounter,tempBinCounter,tempValue,tempScore,tempArray) { for (tempBinCounter = 1; tempBinCounter <= TotalBins; tempBinCounter++) tempScore[tempBinCounter] = 0; for (tempCounter = 1; tempCounter <= TotalNodes; tempCounter++) { if (IsGoalDist[tempCounter] == 1 && SubGraph[root,tempCounter] == 1) # && Layer[tempCounter] == Layer[SubGraph[root,0]]) { for (tempBinCounter = 1; tempBinCounter <= TotalBins; tempBinCounter++) { tempValue = GoalDist[tempCounter,tempBinCounter] - FinalDistArray[Name[tempCounter],tempBinCounter]; if (tempValue < 0) tempValue *= -1; tempScore[tempBinCounter] = tempScore[tempBinCounter] + tempValue * GoalPriority[tempCounter]; } } } for (tempBinCounter = 1; tempBinCounter <= TotalBins; tempBinCounter++) { if (tempScore[tempBinCounter] < MinScore[tempBinCounter]) { MinScore[tempBinCounter] = tempScore[tempBinCounter]; for (tempCounter = 1; tempCounter <= TotalNodes; tempCounter++) if (SubGraph[root,tempCounter] == 1) BestBin[tempCounter,tempBinCounter] = FinalDistArray[Name[tempCounter],tempBinCounter]; } } #normalize the BestBin for (tempCounter = 1; tempCounter <= TotalNodes; tempCounter++) { if (SubGraph[root,tempCounter] == 1) { for (binCounter = 1; binCounter <= TotalBins; binCounter++) tempArray[binCounter] = BestBin[tempCounter,binCounter]; normalizeDist(tempArray); for (binCounter = 1; binCounter <= TotalBins; binCounter++) BestBin[tempCounter,binCounter] = tempArray[binCounter]; } } } function processInternalNode(nodeIndex,tempMinCount,tempMaxCount,tempCounter,tempBinCounter,tempOutArray) { #Having a flag for internal nodes is meaningless since we have to be able to use distributions from child nodes. for (tempBinCounter = 1; tempBinCounter <= TotalBins; tempBinCounter++) { tempMaxCount[tempBinCounter] = -infinity; tempMinCount[tempBinCounter] = infinity; } for (tempCounter = 1; tempCounter <= ChildrenCount[nodeIndex]; tempCounter++) { childName = Children[nodeIndex,tempCounter]; for (tempBinCounter = 1; tempBinCounter <= TotalBins; tempBinCounter++) { if (tempMaxCount[tempBinCounter] < FinalDistArray[childName,tempBinCounter]) tempMaxCount[tempBinCounter] = FinalDistArray[childName,tempBinCounter]; if (tempMinCount[tempBinCounter] > FinalDistArray[childName,tempBinCounter]) tempMinCount[tempBinCounter] = FinalDistArray[childName,tempBinCounter]; } } if (Operation[nodeIndex] == "null") { childName = Children[nodeIndex,1]; for (tempCounter = 1; tempCounter <= TotalBins; tempCounter++) tempOutArray[tempCounter] = FinalDistArray[childName,tempCounter]; } else if (Operation[nodeIndex] == "not") { childName = Children[nodeIndex,1]; for (tempCounter = 1; tempCounter <= TotalBins; tempCounter++) tempOutArray[tempCounter] = 1 - FinalDistArray[childName,tempCounter]; } else if (Operation[nodeIndex] == "and") { for (tempCounter = 1; tempCounter <= TotalBins; tempCounter++) tempOutArray[tempCounter] = tempMinCount[tempCounter]; } else if (Operation[nodeIndex] == "or") { for (tempCounter = 1; tempCounter <= TotalBins; tempCounter++) tempOutArray[tempCounter] = tempMaxCount[tempCounter]; } normalizeDist(tempOutArray); for (tempBinCounter = 1; tempBinCounter <= TotalBins; tempBinCounter++) FinalDistArray[Name[nodeIndex],tempBinCounter] = tempOutArray[tempBinCounter]; } function processLeafNode(nodeIndex,tempBinCounter,tempOutArray) { if (IsGivenDist[nodeIndex] == 1 && (GivenDistFlag[nodeIndex] == "fixed" || run == 1)) getDist(nodeIndex,tempOutArray); else generateDist(nodeIndex,tempOutArray); normalizeDist(tempOutArray); for (tempBinCounter = 1; tempBinCounter <= TotalBins; tempBinCounter++) FinalDistArray[Name[nodeIndex],tempBinCounter] = tempOutArray[tempBinCounter]; } function getDist(nodeIndex,out,tempBinCounter) { for (tempBinCounter = 1; tempBinCounter <= TotalBins; tempBinCounter++) out[tempBinCounter] = GivenDist[nodeIndex,tempBinCounter]; } function generateDist(nodeIndex,out,tempBinCounter) { for (tempBinCounter = 1; tempBinCounter <= TotalBins; tempBinCounter++) { if (rand() > 0.5 && run > 1) out[tempBinCounter] = BestBin[nodeIndex,tempBinCounter]; else { bound = GoalDist[HighestPriorityNode,tempBinCounter] * 2; out[tempBinCounter] = rand() * bound; } } } function normalizeDist(inputArray,tempCounter,sum) { sum = 0; for (tempCounter = 1; tempCounter <= TotalBins; tempCounter++) sum += inputArray[tempCounter]; if (sum != 0) for (tempCounter = 1; tempCounter <= TotalBins; tempCounter++) inputArray[tempCounter] = inputArray[tempCounter] / sum; } function reportSubgraph(root,tempArray) { print "processing subgraph with root "Name[SubGraph[root,0]]; for (j = 1; j <= TotalNodes; j++) { if (SubGraph[root,j] == 1) { for (i = 1; i <= TotalBins; i++) printf("%.2f,",BestBin[j,i]); printf (" %s",Name[j]); if (ChildrenCount[j] > 0 ) { printf(" --> %s (",Operation[j]); comma = ""; for (i = 1; i <= ChildrenCount[j]; i++) { printf("%s",comma); printf("%s", Children[j,i]); comma = "," } printf (")"); } print ":"GivenDistFlag[j]; } } print ""; for (j = 1; j <= TotalNodes; j++) { if (SubGraph[root,j] == 1) { sum = 0; if (IsGoalDist[j] == 1) { for (i = 1; i <= TotalBins; i++) { tempValue = BestBin[j,i] - GoalDist[j,i]; if (tempValue < 0) tempValue *= -1; sum += tempValue; } for (i = 1; i <= TotalBins; i++) printf("%.2f,",BestBin[j,i]); print " --> PREDICTED " Name[j] " ( Priority: " GoalPriority[j] " and Error: " sum*100 "% )"; for (i = 1; i <= TotalBins; i++) printf("%.2f,",GoalDist[j,i]); print " --> GOAL " Name[j]; } } } } function generateRandomGoalAndGivenDist(limit) { count = 0; while (count < limit) { for (nodeCounter = 1; nodeCounter <= TotalNodes; nodeCounter++) { #randomly generate goal distributions if (rand() < 0.5 && IsGoalDist[nodeCounter] == 0 && count < limit && ChildrenCount[nodeCounter] > 0) { for (binCounter = 1; binCounter <= TotalBins; binCounter++) tempArray[binCounter] = int(rand() * TotalBins); normalizeDist(tempArray); for (binCounter = 1; binCounter <= TotalBins; binCounter++) GoalDist[nodeCounter,binCounter] = tempArray[binCounter]; IsGoalDist[nodeCounter] = 1; count++; GoalPriority[nodeCounter] = int(rand() * 100); } #randomly generate given distributions and their flags if (rand() < 0.5 && IsGivenDist[nodeCounter] == 0 && count < limit && ChildrenCount[nodeCounter] == 0) { for (binCounter = 1; binCounter <= TotalBins; binCounter++) tempArray[binCounter] = int(rand() * TotalBins); normalizeDist(tempArray); for (binCounter = 1; binCounter <= TotalBins; binCounter++) GivenDist[nodeCounter,binCounter] = tempArray[binCounter]; IsGivenDist[nodeCounter] = 1; count++; if (rand() < 0.5) GivenDistFlag[nodeCounter] = "opt"; else GivenDistFlag[nodeCounter] = "fixed"; } } } #this block is used to make sure roots have a random goal distribution for (rootCounter = 1; rootCounter <= AllRoots; rootCounter++) { nodeCounter = SubGraph[rootCounter,0]; if (IsGoalDist[nodeCounter] == 0) { for (binCounter = 1; binCounter <= TotalBins; binCounter++) tempArray[binCounter] = int(rand() * TotalBins); normalizeDist(tempArray); for (binCounter = 1; binCounter <= TotalBins; binCounter++) GoalDist[nodeCounter,binCounter] = tempArray[binCounter]; IsGoalDist[nodeCounter] = 1; GoalPriority[nodeCounter] = int(rand() * 100); } } }