Enable UT sendrecv to same rank. Fixes test failure.
Backports commit: https://github.com/ROCm/rccl/commit/fd9924cfe7afbb94b1f157972ba001865481480a
--- a/test/SendRecvTests.cpp
+++ b/test/SendRecvTests.cpp
@@ -16,7 +16,6 @@ namespace RcclUnitTesting
     std::vector<int>            const  numElements     = {1048576, 53327, 1024, 0};
     bool                        const  inPlace         = false;
     bool                        const  useManagedMem   = false;
-    int                         const  groupCallId     = 0;
 
     OptionalColArgs options;
     bool isCorrect = true;
@@ -28,7 +27,10 @@ namespace RcclUnitTesting
       int ranksPerGpu = rpg == 0 ? 1 : testBed.ev.maxRanksPerGpu;
       int totalRanks = numGpus * ranksPerGpu;
       int const numProcesses = isMultiProcess ? numGpus : 1;
-      testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), 1);
+      testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu),
+                        {1,2}, //two group, second group sendrecv to self, has 2 coll
+                        testBed.GetNumStreamsPerGroup(1,2),
+                        2);
 
       for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
       for (int numIdx = 0; numIdx < numElements.size() && isCorrect; ++numIdx)
@@ -37,6 +39,8 @@ namespace RcclUnitTesting
         for (int recvRank = 0; recvRank  < totalRanks; ++recvRank)
         {
           options.root = recvRank;
+          int groupCallId = sendRank == recvRank; //self sendrecv group has two coll
+          int recvId      = sendRank == recvRank; //where recv will be second coll
           testBed.SetCollectiveArgs(ncclCollSend,
                                     dataTypes[dataIdx],
                                     numElements[numIdx],
@@ -47,36 +51,46 @@ namespace RcclUnitTesting
                                     sendRank);
           if (recvRank == 0)
           {
-            testBed.AllocateMem(inPlace, useManagedMem, groupCallId, 0, sendRank);
-            testBed.PrepareData(groupCallId, 0, sendRank);
-          }
-          if (recvRank  != sendRank)
-          {
-            if (testBed.ev.showNames) // Show test names
-              INFO("%s Datatype: %s SendReceive test Rank %d -> Rank %d for %d Elements\n",
-                  isMultiProcess ? "MP" : "SP",
-                  ncclDataTypeNames[dataTypes[dataIdx]],
-                  sendRank,
-                  recvRank,
-                  numElements[numIdx]);
-
-            options.root = sendRank;
-            testBed.SetCollectiveArgs(ncclCollRecv,
+            //set up the collArg slot to make sure AllocateMem is called once and correctly
+            testBed.SetCollectiveArgs(ncclCollSend,
                                       dataTypes[dataIdx],
                                       numElements[numIdx],
                                       numElements[numIdx],
                                       options,
                                       0,
-                                      groupCallId,
-                                      recvRank);
-            testBed.AllocateMem(inPlace, useManagedMem, groupCallId, 0, recvRank);
-            testBed.PrepareData(groupCallId, 0, recvRank);
-            testBed.ExecuteCollectives({sendRank, recvRank});
-            testBed.ValidateResults(isCorrect, groupCallId, 0, recvRank);
-            testBed.DeallocateMem(groupCallId, 0, recvRank);
+                                      !groupCallId,
+                                      sendRank);
+            testBed.AllocateMem(inPlace, useManagedMem, 0, 0, sendRank);
+            testBed.PrepareData(0, 0, sendRank);
+            testBed.AllocateMem(inPlace, useManagedMem, 1, 0, sendRank);
+            testBed.PrepareData(1, 0, sendRank);
           }
+
+          if (testBed.ev.showNames) // Show test names
+            INFO("%s Datatype: %s SendReceive test Rank %d -> Rank %d for %d Elements\n",
+                 isMultiProcess ? "MP" : "SP",
+                 ncclDataTypeNames[dataTypes[dataIdx]],
+                 sendRank,
+                 recvRank,
+                 numElements[numIdx]);
+          options.root = sendRank;
+
+          testBed.SetCollectiveArgs(ncclCollRecv,
+                                    dataTypes[dataIdx],
+                                    numElements[numIdx],
+                                    numElements[numIdx],
+                                    options,
+                                    recvId,
+                                    groupCallId,
+                                    recvRank);
+          testBed.AllocateMem(inPlace, useManagedMem, groupCallId, recvId, recvRank);
+          testBed.PrepareData(groupCallId, recvId, recvRank);
+          testBed.ExecuteCollectives({sendRank, recvRank}, groupCallId);
+          testBed.ValidateResults(isCorrect, groupCallId, recvId, recvRank);
+          testBed.DeallocateMem(groupCallId, recvId, recvRank);
         }
-        testBed.DeallocateMem(groupCallId, 0, sendRank);
+        testBed.DeallocateMem(0, 0, sendRank);
+        testBed.DeallocateMem(1, 0, sendRank);
       }
       testBed.DestroyComms();
     }
@@ -94,7 +108,6 @@ namespace RcclUnitTesting
     bool                        const  inPlace         = false;
     bool                        const  useManagedMem   = false;
     bool                        const  userRegistered  = true;
-    int                         const  groupCallId     = 0;
 
     OptionalColArgs options;
     bool isCorrect = true;
@@ -106,7 +119,10 @@ namespace RcclUnitTesting
       int ranksPerGpu = rpg == 0 ? 1 : testBed.ev.maxRanksPerGpu;
       int totalRanks = numGpus * ranksPerGpu;
       int const numProcesses = isMultiProcess ? numGpus : 1;
-      testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), 1);
+      testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu),
+                        {1,2}, //two group, second group sendrecv to self, has 2 coll
+                        testBed.GetNumStreamsPerGroup(1,2),
+                        2);
 
       for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
       for (int numIdx = 0; numIdx < numElements.size() && isCorrect; ++numIdx)
@@ -115,6 +131,8 @@ namespace RcclUnitTesting
         for (int recvRank = 0; recvRank  < totalRanks; ++recvRank)
         {
           options.root = recvRank;
+          int groupCallId = sendRank == recvRank;
+          int recvId      = sendRank == recvRank;
           testBed.SetCollectiveArgs(ncclCollSend,
                                     dataTypes[dataIdx],
                                     numElements[numIdx],
@@ -125,36 +143,45 @@ namespace RcclUnitTesting
                                     sendRank);
           if (recvRank == 0)
           {
-            testBed.AllocateMem(inPlace, useManagedMem, groupCallId, 0, sendRank, userRegistered);
-            testBed.PrepareData(groupCallId, 0, sendRank);
-          }
-          if (recvRank  != sendRank)
-          {
-            if (testBed.ev.showNames) // Show test names
-              INFO("%s Datatype: %s SendReceive test Rank %d -> Rank %d for %d Elements\n",
-                  isMultiProcess ? "MP" : "SP",
-                  ncclDataTypeNames[dataTypes[dataIdx]],
-                  sendRank,
-                  recvRank,
-                  numElements[numIdx]);
-
-            options.root = sendRank;
-            testBed.SetCollectiveArgs(ncclCollRecv,
+            testBed.SetCollectiveArgs(ncclCollSend,
                                       dataTypes[dataIdx],
                                       numElements[numIdx],
                                       numElements[numIdx],
                                       options,
                                       0,
-                                      groupCallId,
-                                      recvRank);
-            testBed.AllocateMem(inPlace, useManagedMem, groupCallId, 0, recvRank, userRegistered);
-            testBed.PrepareData(groupCallId, 0, recvRank);
-            testBed.ExecuteCollectives({sendRank, recvRank});
-            testBed.ValidateResults(isCorrect, groupCallId, 0, recvRank);
-            testBed.DeallocateMem(groupCallId, 0, recvRank);
+                                      !groupCallId,
+                                      sendRank);
+            testBed.AllocateMem(inPlace, useManagedMem, 0, 0, sendRank, userRegistered);
+            testBed.PrepareData(0, 0, sendRank);
+            testBed.AllocateMem(inPlace, useManagedMem, 1, 0, sendRank, userRegistered);
+            testBed.PrepareData(1, 0, sendRank);
           }
+
+          if (testBed.ev.showNames) // Show test names
+            INFO("%s Datatype: %s SendReceive test Rank %d -> Rank %d for %d Elements\n",
+                 isMultiProcess ? "MP" : "SP",
+                 ncclDataTypeNames[dataTypes[dataIdx]],
+                 sendRank,
+                 recvRank,
+                 numElements[numIdx]);
+
+          options.root = sendRank;
+          testBed.SetCollectiveArgs(ncclCollRecv,
+                                    dataTypes[dataIdx],
+                                    numElements[numIdx],
+                                    numElements[numIdx],
+                                    options,
+                                    recvId,
+                                    groupCallId,
+                                    recvRank);
+          testBed.AllocateMem(inPlace, useManagedMem, groupCallId, recvId, recvRank, userRegistered);
+          testBed.PrepareData(groupCallId, recvId, recvRank);
+          testBed.ExecuteCollectives({sendRank, recvRank}, groupCallId);
+          testBed.ValidateResults(isCorrect, groupCallId, recvId, recvRank);
+          testBed.DeallocateMem(groupCallId, recvId, recvRank);
         }
-        testBed.DeallocateMem(groupCallId, 0, sendRank);
+        testBed.DeallocateMem(0, 0, sendRank);
+        testBed.DeallocateMem(1, 0, sendRank);
       }
       testBed.DestroyComms();
     }
--- a/test/common/TestBedChild.cpp
+++ b/test/common/TestBedChild.cpp
@@ -395,6 +395,8 @@ namespace RcclUnitTesting
       {
         CollectiveArgs& collArg = this->collArgs[groupId][localRank][collIdx];
         CHECK_CALL(collArg.AllocateMem(inPlace, useManagedMem, userRegistered));
+        if (collArg.userRegistered && (collArg.funcType == ncclCollSend || collArg.funcType == ncclCollRecv))
+          CHILD_NCCL_CALL(ncclCommRegister(this->comms[localRank], collArg.inputGpu.ptr, collArg.numInputBytesAllocated, &(collArg.commRegHandle)),"ncclCommRegister");
         if (this->verbose) INFO("Rank %d on child %d allocates memory for collective %d in group %d on device %d (%s,%s,%s) Input: %p Output %p\n",
                                 globalRank, this->childId, collIdx, groupId, this->deviceIds[localRank],
                                 inPlace ? "in-place" : "out-of-place",
@@ -646,8 +648,6 @@ namespace RcclUnitTesting
                           "ncclAllToAllv");
           break;
         case ncclCollSend:
-          if (collArg.userRegistered)
-            CHILD_NCCL_CALL_RANK(errCode, ncclCommRegister(this->comms[localRank], collArg.inputGpu.ptr, collArg.numInputBytesAllocated, &(collArg.commRegHandle)),"ncclCommRegister");
           CHILD_NCCL_CALL_RANK(errCode, ncclSend(
                                    collArg.inputGpu.ptr,
                                    collArg.numInputElements,
@@ -658,8 +658,6 @@ namespace RcclUnitTesting
                           "ncclSend");
           break;
         case ncclCollRecv:
-          if (collArg.userRegistered)
-            CHILD_NCCL_CALL_RANK(errCode, ncclCommRegister(this->comms[localRank], collArg.outputGpu.ptr, collArg.numOutputBytesAllocated, &(collArg.commRegHandle)), "ncclCommRegister");
           CHILD_NCCL_CALL_RANK(errCode, ncclRecv(
                                    collArg.outputGpu.ptr,
                                    collArg.numOutputElements,
@@ -891,8 +889,6 @@ namespace RcclUnitTesting
     for (int collIdx = 0; collIdx < collArgs[groupId][localRank].size(); ++collIdx)
     {
       CollectiveArgs& collArg = this->collArgs[groupId][localRank][collIdx];
-      if (collArg.userRegistered && (collArg.funcType == ncclCollSend || collArg.funcType == ncclCollRecv))
-        CHILD_NCCL_CALL(ncclCommDeregister(this->comms[localRank], collArg.commRegHandle), "ncclCommDeregister");
       if (collId == -1 || collId == collIdx)
       {
         if (this->verbose)
@@ -900,6 +896,10 @@ namespace RcclUnitTesting
           INFO("Child %d release memory for collective %d in group %d (Input: %p Output %p\n",
                this->childId, collIdx, groupId, collArg.inputGpu.ptr, collArg.outputGpu.ptr);
         }
+        if (collArg.userRegistered && (collArg.funcType == ncclCollSend || collArg.funcType == ncclCollRecv))
+        {
+          CHILD_NCCL_CALL(ncclCommDeregister(this->comms[localRank], collArg.commRegHandle), "ncclCommDeregister");
+        }
 
         CHECK_CALL(collArg.DeallocateMem());
       }