Cache Memory Loss on Exceeding Block Size

Question

Introduction and Query I'm amending a GitHub repo to add a feature (HPTT). In a Transpose class, the function createPlan generates custom class computeNodes, summarised below (noting modified lines as //<-Mod and includes all print statements). On exiting the creation loop (where currentNode is initialised), the computeNodes contained within plan are corrupted where they equal or exceed 64 bytes. As I need to pass 72 bytes to computeNode, how can I pad or align the memory so I can access plan data later?

Code transpose.cpp:

template
void Transpose::createPlans( std::vector > &plans ) const
{
   //...
   // Code calculating parallelismStrategies and loopOrders (const int arrays)
   //...


   const int posStride1A_inB = 2; // Example I've used
   const int posStride1B_inA = 4;
   const int dim_ = 6; // Transpose properties marked by underscore
   int perm_[dim_] = {4, 2, 1, 0, 5, 3};
   int permB_[dim_] = {3, 2, 1, 5, 0, 4}
   int sizeA[dim_] = {2, 2, 2, 4, 5, 1};
   int lda_[dim_] = {1, 3, 30, 90, 1800, 18000};
   int ldb_[dim_] = {1, 20, 100, 700, 11900, 95200};
   int offsetA_[dim_] = {0, 5, 0, 7, 2, 6};
   int offsetB_[dim_] = {6, 0, 2, 4, 5, 0};
   int increment_[dim_] = {1, 1, 1, 1, 1, 1};
   int numThreads_ = 1; // in simple case

   const int numThreadsAtLoop = {1, 1, 1, 4, 1, 1};
   const int workPerThread = {0, 0, 0, 1, 0, 0};
   const int loopOrder = {0, 1, 2, 3, 4, 5};
   auto plan = std::make_shared(loopOrder, numThreadsAtLoop); // See plan.cpp and plan.h 
   const int numTasks = plan->getNumTasks();

#ifdef _OPENMP
#pragma omp parallel for num_threads(numThreads_) if(numThreads_ > 1)
#endif
   for( int taskId = 0; taskId < numTasks; taskId++)
   {
      ComputeNode *currentNode = plan->getRootNode(taskId);

      for(int l=0; l < dim_; ++l){
         const int index = loopOrder[l];
         currentNode->inc = increment_[index];

         currentNode->indexA = (size_t)index;//<-Mod
         currentNode->indexB = (size_t)permB_[index];//<-Mod
         currentNode->start = std::min( sizeA_[index] + offsetB_[permB_[index]], (commId * workPerThread[index] * currentNode->inc) + offsetB_[permB_[index]] );//<-Mod (added offsetB_ term)
         currentNode->end = std::min( sizeA_[index] + offsetB_[permB_[index]], ((commId+1) * workPerThread[index] * currentNode->inc) + offsetB_[permB_[index]] );//<-Mod (added offsetB_ term)

         currentNode->lda = lda_[index];
         currentNode->ldb = ldb_[permB_[index]];

         currentNode->offDiffAB = (int)offsetA_[index] - (int)offsetB_[permB_[index]];//<-Mod
#ifdef DEBUG
         printf("(Task %d, Node %p) Level %d is IndexA %zu, IndexB %zu. Start: %zu, End: %zu, lda: %zu, ldb: %zu, offDiffAB: %d
",taskId,currentNode,l,currentNode->indexA,currentNode->indexB,currentNode->start,currentNode->end,currentNode->lda,currentNode->ldb,currentNode->offDiffAB);
#endif

         if( perm_[0] != 0 || l != dim_-1 ){
            printf("Null pointer %p
", currentNode->next);
            currentNode->next = new ComputeNode;
            printf("Assigned new (make shared pointer) ComputeNode at %p from %p
", currentNode->next, currentNode);
            currentNode = currentNode->next;
         }
      }

      //macro-kernel
      if( perm_[0] != 0 )
      {
         currentNode->indexA = (size_t)posStride1B_inA;//<-Mod
         currentNode->indexB = (size_t)posStride1A_inB;//<-Mod
         currentNode->start = -1;
         currentNode->end = -1;
         currentNode->inc = -1;
         currentNode->lda = lda_[ posStride1B_inA ];
         currentNode->ldb = ldb_[ posStride1A_inB ];
         currentNode->offDiffAB = (int)offsetA_[ posStride1B_inA ] - (int)offsetB_[ posStride1A_inB ];//<-Mod
         currentNode->next = nullptr;
#ifdef DEBUG
         printf("    Adjust Node (%p) IndexA: %zu, IndexB: %zu, Start: %zu, End: %zu, lda: %zu, ldb: %zu, offDiffAB: %d
",currentNode,currentNode->indexA,currentNode->indexB,currentNode->start,currentNode->end,currentNode->lda,currentNode->ldb,currentNode->offDiffAB);
#endif
      }
      auto checkNode = plan->getRootNode_const(taskId);//<-Mod
      while (checkNode->next != nullptr) {
         printf("Memory locations: %p (cur), %p (next); Types: %s, %s 
", checkNode, checkNode->next, typeid(*checkNode).name(), typeid(*checkNode->next).name());
         printf("IndexA %zu is IndexB %zu. Start: %zu, End: %zu, lda: %zu, ldb: %zu, offDiffAB: %d
", checkNode->indexA, checkNode->indexB, checkNode->start, checkNode->end, checkNode->lda, checkNode->ldb, checkNode->offDiffAB);
         checkNode = checkNode->next;//<-Mod
      }
   }
   // Check plan data can be accessed
   for (int taskNum = 0; taskNum < numTasks; taskNum++)
   {
      auto currentNode = plan->getRootNode_const(taskNum);//<-Mod
      while (currentNode->next != nullptr) {
         printf("(Task %d) Memory locations: %p (cur), %p (next); Types: %s, %s 
", taskNum, currentNode, currentNode->next, typeid(*currentNode).name(), typeid(*currentNode->next).name());
         printf("IndexA %zu is IndexB %zu. Start: %zu, End: %zu, lda: %zu, ldb: %zu, offDiffAB: %d
", currentNode->indexA, currentNode->indexB, currentNode->start, currentNode->end, currentNode->lda, currentNode->ldb, currentNode->offDiffAB);
         currentNode = currentNode->next;//<-Mod
      }
   }

   plans.push_back(plan);
}

computeNode.h:

class ComputeNode
{
   public:
      ComputeNode() : start(-1), end(-1), inc(-1), lda(-1), ldb(-1), indexA(0), indexB(0), offDiffAB(0), next(nullptr) {}//<-Mod

      ~ComputeNode() {
         if ( next != nullptr )
            delete next;
      }

   size_t start; //!< start index for at the current loop
   size_t end; //!< end index for at the current loop
   size_t inc; //!< increment for at the current loop
   size_t lda; //!< stride of A w.r.t. the loop index
   size_t ldb; //!< stride of B w.r.t. the loop index
   size_t indexA; //!< index of A (enabling us to determine if this is the innermost loop of matrix A)//<-Mod
   size_t indexB; //!< index of B (enabling us to determine if this is the innermost loop of matrix B)//<-Mod
   int    offDiffAB; //!< difference in offset A and B (i.e., A - B) at the current loop//<-Mod
   ComputeNode *next; //!< next ComputeNode, this might be another loop or 'nullptr' (i.e., indicating that the macro-kernel should be called)
};

Reference Only - Unmodified Files transpose.h:

class Transpose
{
//...
   private:
      void createPlans( std::vector > &plans ) const;
//...
}

plan.cpp:

   Plan::Plan(std::vectorloopOrder, std::vectornumThreadsAtLoop) : rootNodes_(nullptr), loopOrder_(loopOrder), numThreadsAtLoop_(numThreadsAtLoop) {
      numTasks_ = 1;
      for(auto nt : numThreadsAtLoop)
         numTasks_ *= nt;
      rootNodes_ = new ComputeNode[numTasks_];
   }

   Plan::~Plan() {
      if ( rootNodes_ != nullptr )
         delete[] rootNodes_;
   }

   const ComputeNode* Plan::getRootNode_const(int threadId) const { return &rootNodes_[threadId]; }
   ComputeNode* Plan::getRootNode(int threadId) const { return &rootNodes_[threadId]; }

plan.h:

class Plan
{
   public:
      Plan() : rootNodes_(nullptr), numTasks_(0) { }

      Plan(std::vectorloopOrder, std::vectornumThreadsAtLoop);

      ~Plan();


      const ComputeNode* getRootNode_const(int threadId) const;
      ComputeNode* getRootNode(int threadId) const;
      int getNumTasks() const { return numTasks_; } 

      void print() const; 

   private:
      int numTasks_;
      std::vector loopOrder_; //!< loop order. For example, if \f$ B_{1,0,2} \gets A_{0,1,2}\f$. loopOrder_ = {1,0,2} denotes that B is travesed in a linear fashion.
      std::vector numThreadsAtLoop_;
      ComputeNode *rootNodes_;
};

Highlights

Without the two new terms (indexA, indexB), memory uncorrupted - 56 bytes per computeNode.
Less than 64 bytes in any size is fine.
Where data exceeds 64 bytes data is read from the start of that block again. (offDiffAB reads as start)
I don't have any ASAN or compiler (clang) warnings or errors.

Output I've checked the output with lldb (MacOS M2 user) which shows the destructor for computeNode is not called:

(Task 0, Node 0x137f04390) Level 0 is IndexA 0, IndexB 3. Start: 4, End: 6, lda: 1, ldb: 700, offDiffAB: -4
Null pointer 0xffffffffffffffff
Assigned new ComputeNode at 0x600000894080 from 0x137f04390
Compute Node Size: 128 (cur), 128 (next), 128 (gen)
(Task 0, Node 0x600000894080) Level 1 is IndexA 1, IndexB 2. Start: 2, End: 4, lda: 3, ldb: 100, offDiffAB: 3
Null pointer 0x0
Assigned new ComputeNode at 0x600000894100 from 0x600000894080
Compute Node Size: 128 (cur), 128 (next), 128 (gen)
(Task 0, Node 0x600000894100) Level 2 is IndexA 2, IndexB 1. Start: 0, End: 2, lda: 30, ldb: 20, offDiffAB: 0
Null pointer 0x0
Assigned new ComputeNode at 0x600000894180 from 0x600000894100
Compute Node Size: 128 (cur), 128 (next), 128 (gen)
(Task 0, Node 0x600000894180) Level 3 is IndexA 3, IndexB 5. Start: 0, End: 1, lda: 90, ldb: 95200, offDiffAB: 7
Null pointer 0x0
Assigned new ComputeNode at 0x600000894200 from 0x600000894180
Compute Node Size: 128 (cur), 128 (next), 128 (gen)
(Task 0, Node 0x600000894200) Level 4 is IndexA 4, IndexB 0. Start: 6, End: 11, lda: 1800, ldb: 1, offDiffAB: -4
Null pointer 0x0
Assigned new ComputeNode at 0x600000898000 from 0x600000894200
Compute Node Size: 128 (cur), 128 (next), 128 (gen)
(Task 0, Node 0x600000898000) Level 5 is IndexA 5, IndexB 4. Start: 5, End: 6, lda: 18000, ldb: 11900, offDiffAB: 1
Null pointer 0x0
Assigned new ComputeNode at 0x600000898080 from 0x600000898000
Compute Node Size: 128 (cur), 128 (next), 128 (gen)
    Adjust Node (0x600000898080) IndexA: 4, IndexB: 3, Start: 18446744073709551615, End: 18446744073709551615, lda: 1800, ldb: 700, offDiffAB: -2
Memory locations: 0x137f04390 (cur), 0x600000894080 (next); Types: N4hptt11ComputeNodeE, N4hptt11ComputeNodeE 
IndexA 0 is IndexB 3. Start: 4, End: 6, lda: 1, ldb: 700, offDiffAB: -4
Memory locations: 0x600000894080 (cur), 0x600000894100 (next); Types: N4hptt11ComputeNodeE, N4hptt11ComputeNodeE 
IndexA 1 is IndexB 2. Start: 2, End: 4, lda: 3, ldb: 100, offDiffAB: 3
Memory locations: 0x600000894100 (cur), 0x600000894180 (next); Types: N4hptt11ComputeNodeE, N4hptt11ComputeNodeE 
IndexA 2 is IndexB 1. Start: 0, End: 2, lda: 30, ldb: 20, offDiffAB: 0
Memory locations: 0x600000894180 (cur), 0x600000894200 (next); Types: N4hptt11ComputeNodeE, N4hptt11ComputeNodeE 
IndexA 3 is IndexB 5. Start: 0, End: 1, lda: 90, ldb: 95200, offDiffAB: 7
Memory locations: 0x600000894200 (cur), 0x600000898000 (next); Types: N4hptt11ComputeNodeE, N4hptt11ComputeNodeE 
IndexA 4 is IndexB 0. Start: 6, End: 11, lda: 1800, ldb: 1, offDiffAB: -4
Memory locations: 0x600000898000 (cur), 0x600000898080 (next); Types: N4hptt11ComputeNodeE, N4hptt11ComputeNodeE 
IndexA 5 is IndexB 4. Start: 5, End: 6, lda: 18000, ldb: 11900, offDiffAB: 1

//...
// Tasks 1-4 Inclusive (same but Level 3 increments start and end by 1)
//...

Compute Node Size: 128 (cur), 128 (next), 128 (gen)
Size t size 8, int size 8, ComputeNode Pointer size 8, nullptr size 8
(Task: 0) Memory locations: 0x137f04390 (cur), 0x6 (next); Types: N4hptt11ComputeNodeE, N4hptt11ComputeNodeE 
IndexA 0 is IndexB 3. Start: 4, End: 6, lda: 1, ldb: 700, offDiffAB: 4
Process 96167 stopped
* thread #1, queue = 'com.apple.main-thread', stop reason = EXC_BAD_ACCESS (code=1, address=0x46)
    frame #0: 0x0000000100684ddc libhptt.so`hptt::Transpose::createPlans(std::__1::vector, std::__1::allocator>>&) const + 2892
libhptt.so`hptt::Transpose::createPlans:
->  0x100684ddc <+2892>: ldr    x8, [x26, #0x40]
    0x100684de0 <+2896>: cbnz   x8, 0x100684d7c           ; <+2796>
    0x100684de4 <+2900>: add    w19, w19, #0x1
    0x100684de8 <+2904>: ldr    w8, [sp, #0xa4]
Target 0: (testframework.exe) stopped.

Cache Memory Loss on Exceeding Block Size

Answers (0)

Related Questions