Nathan Harmer
Nathan Harmer

Reputation: 1

Cache Memory Loss on Exceeding Block Size

Introduction and Query I'm amending a GitHub repo to add a feature (HPTT). In a Transpose class, the function createPlan generates custom class computeNodes, summarised below (noting modified lines as //<-Mod and includes all print statements). On exiting the creation loop (where currentNode is initialised), the computeNodes contained within plan are corrupted where they equal or exceed 64 bytes. As I need to pass 72 bytes to computeNode, how can I pad or align the memory so I can access plan data later?

Code transpose.cpp:

template<typename floatType>
void Transpose<floatType>::createPlans( std::vector<std::shared_ptr<Plan> > &plans ) const
{
   //...
   // Code calculating parallelismStrategies and loopOrders (const int arrays)
   //...


   const int posStride1A_inB = 2; // Example I've used
   const int posStride1B_inA = 4;
   const int dim_ = 6; // Transpose properties marked by underscore
   int perm_[dim_] = {4, 2, 1, 0, 5, 3};
   int permB_[dim_] = {3, 2, 1, 5, 0, 4}
   int sizeA[dim_] = {2, 2, 2, 4, 5, 1};
   int lda_[dim_] = {1, 3, 30, 90, 1800, 18000};
   int ldb_[dim_] = {1, 20, 100, 700, 11900, 95200};
   int offsetA_[dim_] = {0, 5, 0, 7, 2, 6};
   int offsetB_[dim_] = {6, 0, 2, 4, 5, 0};
   int increment_[dim_] = {1, 1, 1, 1, 1, 1};
   int numThreads_ = 1; // in simple case

   const int numThreadsAtLoop = {1, 1, 1, 4, 1, 1};
   const int workPerThread = {0, 0, 0, 1, 0, 0};
   const int loopOrder = {0, 1, 2, 3, 4, 5};
   auto plan = std::make_shared<Plan>(loopOrder, numThreadsAtLoop); // See plan.cpp and plan.h 
   const int numTasks = plan->getNumTasks();

#ifdef _OPENMP
#pragma omp parallel for num_threads(numThreads_) if(numThreads_ > 1)
#endif
   for( int taskId = 0; taskId < numTasks; taskId++)
   {
      ComputeNode *currentNode = plan->getRootNode(taskId);

      for(int l=0; l < dim_; ++l){
         const int index = loopOrder[l];
         currentNode->inc = increment_[index];

         currentNode->indexA = (size_t)index;//<-Mod
         currentNode->indexB = (size_t)permB_[index];//<-Mod
         currentNode->start = std::min( sizeA_[index] + offsetB_[permB_[index]], (commId * workPerThread[index] * currentNode->inc) + offsetB_[permB_[index]] );//<-Mod (added offsetB_ term)
         currentNode->end = std::min( sizeA_[index] + offsetB_[permB_[index]], ((commId+1) * workPerThread[index] * currentNode->inc) + offsetB_[permB_[index]] );//<-Mod (added offsetB_ term)

         currentNode->lda = lda_[index];
         currentNode->ldb = ldb_[permB_[index]];

         currentNode->offDiffAB = (int)offsetA_[index] - (int)offsetB_[permB_[index]];//<-Mod
#ifdef DEBUG
         printf("(Task %d, Node %p) Level %d is IndexA %zu, IndexB %zu. Start: %zu, End: %zu, lda: %zu, ldb: %zu, offDiffAB: %d\n",taskId,currentNode,l,currentNode->indexA,currentNode->indexB,currentNode->start,currentNode->end,currentNode->lda,currentNode->ldb,currentNode->offDiffAB);
#endif

         if( perm_[0] != 0 || l != dim_-1 ){
            printf("Null pointer %p\n", currentNode->next);
            currentNode->next = new ComputeNode;
            printf("Assigned new (make shared pointer) ComputeNode at %p from %p\n", currentNode->next, currentNode);
            currentNode = currentNode->next;
         }
      }

      //macro-kernel
      if( perm_[0] != 0 )
      {
         currentNode->indexA = (size_t)posStride1B_inA;//<-Mod
         currentNode->indexB = (size_t)posStride1A_inB;//<-Mod
         currentNode->start = -1;
         currentNode->end = -1;
         currentNode->inc = -1;
         currentNode->lda = lda_[ posStride1B_inA ];
         currentNode->ldb = ldb_[ posStride1A_inB ];
         currentNode->offDiffAB = (int)offsetA_[ posStride1B_inA ] - (int)offsetB_[ posStride1A_inB ];//<-Mod
         currentNode->next = nullptr;
#ifdef DEBUG
         printf("    Adjust Node (%p) IndexA: %zu, IndexB: %zu, Start: %zu, End: %zu, lda: %zu, ldb: %zu, offDiffAB: %d\n",currentNode,currentNode->indexA,currentNode->indexB,currentNode->start,currentNode->end,currentNode->lda,currentNode->ldb,currentNode->offDiffAB);
#endif
      }
      auto checkNode = plan->getRootNode_const(taskId);//<-Mod
      while (checkNode->next != nullptr) {
         printf("Memory locations: %p (cur), %p (next); Types: %s, %s \n", checkNode, checkNode->next, typeid(*checkNode).name(), typeid(*checkNode->next).name());
         printf("IndexA %zu is IndexB %zu. Start: %zu, End: %zu, lda: %zu, ldb: %zu, offDiffAB: %d\n", checkNode->indexA, checkNode->indexB, checkNode->start, checkNode->end, checkNode->lda, checkNode->ldb, checkNode->offDiffAB);
         checkNode = checkNode->next;//<-Mod
      }
   }
   // Check plan data can be accessed
   for (int taskNum = 0; taskNum < numTasks; taskNum++)
   {
      auto currentNode = plan->getRootNode_const(taskNum);//<-Mod
      while (currentNode->next != nullptr) {
         printf("(Task %d) Memory locations: %p (cur), %p (next); Types: %s, %s \n", taskNum, currentNode, currentNode->next, typeid(*currentNode).name(), typeid(*currentNode->next).name());
         printf("IndexA %zu is IndexB %zu. Start: %zu, End: %zu, lda: %zu, ldb: %zu, offDiffAB: %d\n", currentNode->indexA, currentNode->indexB, currentNode->start, currentNode->end, currentNode->lda, currentNode->ldb, currentNode->offDiffAB);
         currentNode = currentNode->next;//<-Mod
      }
   }

   plans.push_back(plan);
}

computeNode.h:

class ComputeNode
{
   public:
      ComputeNode() : start(-1), end(-1), inc(-1), lda(-1), ldb(-1), indexA(0), indexB(0), offDiffAB(0), next(nullptr) {}//<-Mod

      ~ComputeNode() {
         if ( next != nullptr )
            delete next;
      }

   size_t start; //!< start index for at the current loop
   size_t end; //!< end index for at the current loop
   size_t inc; //!< increment for at the current loop
   size_t lda; //!< stride of A w.r.t. the loop index
   size_t ldb; //!< stride of B w.r.t. the loop index
   size_t indexA; //!< index of A (enabling us to determine if this is the innermost loop of matrix A)//<-Mod
   size_t indexB; //!< index of B (enabling us to determine if this is the innermost loop of matrix B)//<-Mod
   int    offDiffAB; //!< difference in offset A and B (i.e., A - B) at the current loop//<-Mod
   ComputeNode *next; //!< next ComputeNode, this might be another loop or 'nullptr' (i.e., indicating that the macro-kernel should be called)
};

Reference Only - Unmodified Files transpose.h:

class Transpose
{
//...
   private:
      void createPlans( std::vector<std::shared_ptr<Plan> > &plans ) const;
//...
}

plan.cpp:

   Plan::Plan(std::vector<int>loopOrder, std::vector<int>numThreadsAtLoop) : rootNodes_(nullptr), loopOrder_(loopOrder), numThreadsAtLoop_(numThreadsAtLoop) {
      numTasks_ = 1;
      for(auto nt : numThreadsAtLoop)
         numTasks_ *= nt;
      rootNodes_ = new ComputeNode[numTasks_];
   }

   Plan::~Plan() {
      if ( rootNodes_ != nullptr )
         delete[] rootNodes_;
   }

   const ComputeNode* Plan::getRootNode_const(int threadId) const { return &rootNodes_[threadId]; }
   ComputeNode* Plan::getRootNode(int threadId) const { return &rootNodes_[threadId]; }

plan.h:

class Plan
{
   public:
      Plan() : rootNodes_(nullptr), numTasks_(0) { }

      Plan(std::vector<int>loopOrder, std::vector<int>numThreadsAtLoop);

      ~Plan();


      const ComputeNode* getRootNode_const(int threadId) const;
      ComputeNode* getRootNode(int threadId) const;
      int getNumTasks() const { return numTasks_; } 

      void print() const; 

   private:
      int numTasks_;
      std::vector<int> loopOrder_; //!< loop order. For example, if \f$ B_{1,0,2} \gets A_{0,1,2}\f$. loopOrder_ = {1,0,2} denotes that B is travesed in a linear fashion.
      std::vector<int> numThreadsAtLoop_;
      ComputeNode *rootNodes_;
};

Highlights

Output I've checked the output with lldb (MacOS M2 user) which shows the destructor for computeNode is not called:

(Task 0, Node 0x137f04390) Level 0 is IndexA 0, IndexB 3. Start: 4, End: 6, lda: 1, ldb: 700, offDiffAB: -4
Null pointer 0xffffffffffffffff
Assigned new ComputeNode at 0x600000894080 from 0x137f04390
Compute Node Size: 128 (cur), 128 (next), 128 (gen)
(Task 0, Node 0x600000894080) Level 1 is IndexA 1, IndexB 2. Start: 2, End: 4, lda: 3, ldb: 100, offDiffAB: 3
Null pointer 0x0
Assigned new ComputeNode at 0x600000894100 from 0x600000894080
Compute Node Size: 128 (cur), 128 (next), 128 (gen)
(Task 0, Node 0x600000894100) Level 2 is IndexA 2, IndexB 1. Start: 0, End: 2, lda: 30, ldb: 20, offDiffAB: 0
Null pointer 0x0
Assigned new ComputeNode at 0x600000894180 from 0x600000894100
Compute Node Size: 128 (cur), 128 (next), 128 (gen)
(Task 0, Node 0x600000894180) Level 3 is IndexA 3, IndexB 5. Start: 0, End: 1, lda: 90, ldb: 95200, offDiffAB: 7
Null pointer 0x0
Assigned new ComputeNode at 0x600000894200 from 0x600000894180
Compute Node Size: 128 (cur), 128 (next), 128 (gen)
(Task 0, Node 0x600000894200) Level 4 is IndexA 4, IndexB 0. Start: 6, End: 11, lda: 1800, ldb: 1, offDiffAB: -4
Null pointer 0x0
Assigned new ComputeNode at 0x600000898000 from 0x600000894200
Compute Node Size: 128 (cur), 128 (next), 128 (gen)
(Task 0, Node 0x600000898000) Level 5 is IndexA 5, IndexB 4. Start: 5, End: 6, lda: 18000, ldb: 11900, offDiffAB: 1
Null pointer 0x0
Assigned new ComputeNode at 0x600000898080 from 0x600000898000
Compute Node Size: 128 (cur), 128 (next), 128 (gen)
    Adjust Node (0x600000898080) IndexA: 4, IndexB: 3, Start: 18446744073709551615, End: 18446744073709551615, lda: 1800, ldb: 700, offDiffAB: -2
Memory locations: 0x137f04390 (cur), 0x600000894080 (next); Types: N4hptt11ComputeNodeE, N4hptt11ComputeNodeE 
IndexA 0 is IndexB 3. Start: 4, End: 6, lda: 1, ldb: 700, offDiffAB: -4
Memory locations: 0x600000894080 (cur), 0x600000894100 (next); Types: N4hptt11ComputeNodeE, N4hptt11ComputeNodeE 
IndexA 1 is IndexB 2. Start: 2, End: 4, lda: 3, ldb: 100, offDiffAB: 3
Memory locations: 0x600000894100 (cur), 0x600000894180 (next); Types: N4hptt11ComputeNodeE, N4hptt11ComputeNodeE 
IndexA 2 is IndexB 1. Start: 0, End: 2, lda: 30, ldb: 20, offDiffAB: 0
Memory locations: 0x600000894180 (cur), 0x600000894200 (next); Types: N4hptt11ComputeNodeE, N4hptt11ComputeNodeE 
IndexA 3 is IndexB 5. Start: 0, End: 1, lda: 90, ldb: 95200, offDiffAB: 7
Memory locations: 0x600000894200 (cur), 0x600000898000 (next); Types: N4hptt11ComputeNodeE, N4hptt11ComputeNodeE 
IndexA 4 is IndexB 0. Start: 6, End: 11, lda: 1800, ldb: 1, offDiffAB: -4
Memory locations: 0x600000898000 (cur), 0x600000898080 (next); Types: N4hptt11ComputeNodeE, N4hptt11ComputeNodeE 
IndexA 5 is IndexB 4. Start: 5, End: 6, lda: 18000, ldb: 11900, offDiffAB: 1

//...
// Tasks 1-4 Inclusive (same but Level 3 increments start and end by 1)
//...

Compute Node Size: 128 (cur), 128 (next), 128 (gen)
Size t size 8, int size 8, ComputeNode Pointer size 8, nullptr size 8
(Task: 0) Memory locations: 0x137f04390 (cur), 0x6 (next); Types: N4hptt11ComputeNodeE, N4hptt11ComputeNodeE 
IndexA 0 is IndexB 3. Start: 4, End: 6, lda: 1, ldb: 700, offDiffAB: 4
Process 96167 stopped
* thread #1, queue = 'com.apple.main-thread', stop reason = EXC_BAD_ACCESS (code=1, address=0x46)
    frame #0: 0x0000000100684ddc libhptt.so`hptt::Transpose<float>::createPlans(std::__1::vector<std::__1::shared_ptr<hptt::Plan>, std::__1::allocator<std::__1::shared_ptr<hptt::Plan>>>&) const + 2892
libhptt.so`hptt::Transpose<float>::createPlans:
->  0x100684ddc <+2892>: ldr    x8, [x26, #0x40]
    0x100684de0 <+2896>: cbnz   x8, 0x100684d7c           ; <+2796>
    0x100684de4 <+2900>: add    w19, w19, #0x1
    0x100684de8 <+2904>: ldr    w8, [sp, #0xa4]
Target 0: (testframework.exe) stopped.

Upvotes: 0

Views: 38

Answers (0)

Related Questions