Reputation: 1
Introduction and Query
I'm amending a GitHub repo to add a feature (HPTT). In a Transpose
class, the function createPlan
generates custom class computeNode
s, summarised below (noting modified lines as //<-Mod and includes all print statements). On exiting the creation loop (where currentNode
is initialised), the computeNode
s contained within plan
are corrupted where they equal or exceed 64 bytes. As I need to pass 72 bytes to computeNode
, how can I pad or align the memory so I can access plan
data later?
Code transpose.cpp:
template<typename floatType>
void Transpose<floatType>::createPlans( std::vector<std::shared_ptr<Plan> > &plans ) const
{
//...
// Code calculating parallelismStrategies and loopOrders (const int arrays)
//...
const int posStride1A_inB = 2; // Example I've used
const int posStride1B_inA = 4;
const int dim_ = 6; // Transpose properties marked by underscore
int perm_[dim_] = {4, 2, 1, 0, 5, 3};
int permB_[dim_] = {3, 2, 1, 5, 0, 4}
int sizeA[dim_] = {2, 2, 2, 4, 5, 1};
int lda_[dim_] = {1, 3, 30, 90, 1800, 18000};
int ldb_[dim_] = {1, 20, 100, 700, 11900, 95200};
int offsetA_[dim_] = {0, 5, 0, 7, 2, 6};
int offsetB_[dim_] = {6, 0, 2, 4, 5, 0};
int increment_[dim_] = {1, 1, 1, 1, 1, 1};
int numThreads_ = 1; // in simple case
const int numThreadsAtLoop = {1, 1, 1, 4, 1, 1};
const int workPerThread = {0, 0, 0, 1, 0, 0};
const int loopOrder = {0, 1, 2, 3, 4, 5};
auto plan = std::make_shared<Plan>(loopOrder, numThreadsAtLoop); // See plan.cpp and plan.h
const int numTasks = plan->getNumTasks();
#ifdef _OPENMP
#pragma omp parallel for num_threads(numThreads_) if(numThreads_ > 1)
#endif
for( int taskId = 0; taskId < numTasks; taskId++)
{
ComputeNode *currentNode = plan->getRootNode(taskId);
for(int l=0; l < dim_; ++l){
const int index = loopOrder[l];
currentNode->inc = increment_[index];
currentNode->indexA = (size_t)index;//<-Mod
currentNode->indexB = (size_t)permB_[index];//<-Mod
currentNode->start = std::min( sizeA_[index] + offsetB_[permB_[index]], (commId * workPerThread[index] * currentNode->inc) + offsetB_[permB_[index]] );//<-Mod (added offsetB_ term)
currentNode->end = std::min( sizeA_[index] + offsetB_[permB_[index]], ((commId+1) * workPerThread[index] * currentNode->inc) + offsetB_[permB_[index]] );//<-Mod (added offsetB_ term)
currentNode->lda = lda_[index];
currentNode->ldb = ldb_[permB_[index]];
currentNode->offDiffAB = (int)offsetA_[index] - (int)offsetB_[permB_[index]];//<-Mod
#ifdef DEBUG
printf("(Task %d, Node %p) Level %d is IndexA %zu, IndexB %zu. Start: %zu, End: %zu, lda: %zu, ldb: %zu, offDiffAB: %d\n",taskId,currentNode,l,currentNode->indexA,currentNode->indexB,currentNode->start,currentNode->end,currentNode->lda,currentNode->ldb,currentNode->offDiffAB);
#endif
if( perm_[0] != 0 || l != dim_-1 ){
printf("Null pointer %p\n", currentNode->next);
currentNode->next = new ComputeNode;
printf("Assigned new (make shared pointer) ComputeNode at %p from %p\n", currentNode->next, currentNode);
currentNode = currentNode->next;
}
}
//macro-kernel
if( perm_[0] != 0 )
{
currentNode->indexA = (size_t)posStride1B_inA;//<-Mod
currentNode->indexB = (size_t)posStride1A_inB;//<-Mod
currentNode->start = -1;
currentNode->end = -1;
currentNode->inc = -1;
currentNode->lda = lda_[ posStride1B_inA ];
currentNode->ldb = ldb_[ posStride1A_inB ];
currentNode->offDiffAB = (int)offsetA_[ posStride1B_inA ] - (int)offsetB_[ posStride1A_inB ];//<-Mod
currentNode->next = nullptr;
#ifdef DEBUG
printf(" Adjust Node (%p) IndexA: %zu, IndexB: %zu, Start: %zu, End: %zu, lda: %zu, ldb: %zu, offDiffAB: %d\n",currentNode,currentNode->indexA,currentNode->indexB,currentNode->start,currentNode->end,currentNode->lda,currentNode->ldb,currentNode->offDiffAB);
#endif
}
auto checkNode = plan->getRootNode_const(taskId);//<-Mod
while (checkNode->next != nullptr) {
printf("Memory locations: %p (cur), %p (next); Types: %s, %s \n", checkNode, checkNode->next, typeid(*checkNode).name(), typeid(*checkNode->next).name());
printf("IndexA %zu is IndexB %zu. Start: %zu, End: %zu, lda: %zu, ldb: %zu, offDiffAB: %d\n", checkNode->indexA, checkNode->indexB, checkNode->start, checkNode->end, checkNode->lda, checkNode->ldb, checkNode->offDiffAB);
checkNode = checkNode->next;//<-Mod
}
}
// Check plan data can be accessed
for (int taskNum = 0; taskNum < numTasks; taskNum++)
{
auto currentNode = plan->getRootNode_const(taskNum);//<-Mod
while (currentNode->next != nullptr) {
printf("(Task %d) Memory locations: %p (cur), %p (next); Types: %s, %s \n", taskNum, currentNode, currentNode->next, typeid(*currentNode).name(), typeid(*currentNode->next).name());
printf("IndexA %zu is IndexB %zu. Start: %zu, End: %zu, lda: %zu, ldb: %zu, offDiffAB: %d\n", currentNode->indexA, currentNode->indexB, currentNode->start, currentNode->end, currentNode->lda, currentNode->ldb, currentNode->offDiffAB);
currentNode = currentNode->next;//<-Mod
}
}
plans.push_back(plan);
}
computeNode.h:
class ComputeNode
{
public:
ComputeNode() : start(-1), end(-1), inc(-1), lda(-1), ldb(-1), indexA(0), indexB(0), offDiffAB(0), next(nullptr) {}//<-Mod
~ComputeNode() {
if ( next != nullptr )
delete next;
}
size_t start; //!< start index for at the current loop
size_t end; //!< end index for at the current loop
size_t inc; //!< increment for at the current loop
size_t lda; //!< stride of A w.r.t. the loop index
size_t ldb; //!< stride of B w.r.t. the loop index
size_t indexA; //!< index of A (enabling us to determine if this is the innermost loop of matrix A)//<-Mod
size_t indexB; //!< index of B (enabling us to determine if this is the innermost loop of matrix B)//<-Mod
int offDiffAB; //!< difference in offset A and B (i.e., A - B) at the current loop//<-Mod
ComputeNode *next; //!< next ComputeNode, this might be another loop or 'nullptr' (i.e., indicating that the macro-kernel should be called)
};
Reference Only - Unmodified Files transpose.h:
class Transpose
{
//...
private:
void createPlans( std::vector<std::shared_ptr<Plan> > &plans ) const;
//...
}
plan.cpp:
Plan::Plan(std::vector<int>loopOrder, std::vector<int>numThreadsAtLoop) : rootNodes_(nullptr), loopOrder_(loopOrder), numThreadsAtLoop_(numThreadsAtLoop) {
numTasks_ = 1;
for(auto nt : numThreadsAtLoop)
numTasks_ *= nt;
rootNodes_ = new ComputeNode[numTasks_];
}
Plan::~Plan() {
if ( rootNodes_ != nullptr )
delete[] rootNodes_;
}
const ComputeNode* Plan::getRootNode_const(int threadId) const { return &rootNodes_[threadId]; }
ComputeNode* Plan::getRootNode(int threadId) const { return &rootNodes_[threadId]; }
plan.h:
class Plan
{
public:
Plan() : rootNodes_(nullptr), numTasks_(0) { }
Plan(std::vector<int>loopOrder, std::vector<int>numThreadsAtLoop);
~Plan();
const ComputeNode* getRootNode_const(int threadId) const;
ComputeNode* getRootNode(int threadId) const;
int getNumTasks() const { return numTasks_; }
void print() const;
private:
int numTasks_;
std::vector<int> loopOrder_; //!< loop order. For example, if \f$ B_{1,0,2} \gets A_{0,1,2}\f$. loopOrder_ = {1,0,2} denotes that B is travesed in a linear fashion.
std::vector<int> numThreadsAtLoop_;
ComputeNode *rootNodes_;
};
Highlights
offDiffAB
reads as start
)Output I've checked the output with lldb (MacOS M2 user) which shows the destructor for computeNode is not called:
(Task 0, Node 0x137f04390) Level 0 is IndexA 0, IndexB 3. Start: 4, End: 6, lda: 1, ldb: 700, offDiffAB: -4
Null pointer 0xffffffffffffffff
Assigned new ComputeNode at 0x600000894080 from 0x137f04390
Compute Node Size: 128 (cur), 128 (next), 128 (gen)
(Task 0, Node 0x600000894080) Level 1 is IndexA 1, IndexB 2. Start: 2, End: 4, lda: 3, ldb: 100, offDiffAB: 3
Null pointer 0x0
Assigned new ComputeNode at 0x600000894100 from 0x600000894080
Compute Node Size: 128 (cur), 128 (next), 128 (gen)
(Task 0, Node 0x600000894100) Level 2 is IndexA 2, IndexB 1. Start: 0, End: 2, lda: 30, ldb: 20, offDiffAB: 0
Null pointer 0x0
Assigned new ComputeNode at 0x600000894180 from 0x600000894100
Compute Node Size: 128 (cur), 128 (next), 128 (gen)
(Task 0, Node 0x600000894180) Level 3 is IndexA 3, IndexB 5. Start: 0, End: 1, lda: 90, ldb: 95200, offDiffAB: 7
Null pointer 0x0
Assigned new ComputeNode at 0x600000894200 from 0x600000894180
Compute Node Size: 128 (cur), 128 (next), 128 (gen)
(Task 0, Node 0x600000894200) Level 4 is IndexA 4, IndexB 0. Start: 6, End: 11, lda: 1800, ldb: 1, offDiffAB: -4
Null pointer 0x0
Assigned new ComputeNode at 0x600000898000 from 0x600000894200
Compute Node Size: 128 (cur), 128 (next), 128 (gen)
(Task 0, Node 0x600000898000) Level 5 is IndexA 5, IndexB 4. Start: 5, End: 6, lda: 18000, ldb: 11900, offDiffAB: 1
Null pointer 0x0
Assigned new ComputeNode at 0x600000898080 from 0x600000898000
Compute Node Size: 128 (cur), 128 (next), 128 (gen)
Adjust Node (0x600000898080) IndexA: 4, IndexB: 3, Start: 18446744073709551615, End: 18446744073709551615, lda: 1800, ldb: 700, offDiffAB: -2
Memory locations: 0x137f04390 (cur), 0x600000894080 (next); Types: N4hptt11ComputeNodeE, N4hptt11ComputeNodeE
IndexA 0 is IndexB 3. Start: 4, End: 6, lda: 1, ldb: 700, offDiffAB: -4
Memory locations: 0x600000894080 (cur), 0x600000894100 (next); Types: N4hptt11ComputeNodeE, N4hptt11ComputeNodeE
IndexA 1 is IndexB 2. Start: 2, End: 4, lda: 3, ldb: 100, offDiffAB: 3
Memory locations: 0x600000894100 (cur), 0x600000894180 (next); Types: N4hptt11ComputeNodeE, N4hptt11ComputeNodeE
IndexA 2 is IndexB 1. Start: 0, End: 2, lda: 30, ldb: 20, offDiffAB: 0
Memory locations: 0x600000894180 (cur), 0x600000894200 (next); Types: N4hptt11ComputeNodeE, N4hptt11ComputeNodeE
IndexA 3 is IndexB 5. Start: 0, End: 1, lda: 90, ldb: 95200, offDiffAB: 7
Memory locations: 0x600000894200 (cur), 0x600000898000 (next); Types: N4hptt11ComputeNodeE, N4hptt11ComputeNodeE
IndexA 4 is IndexB 0. Start: 6, End: 11, lda: 1800, ldb: 1, offDiffAB: -4
Memory locations: 0x600000898000 (cur), 0x600000898080 (next); Types: N4hptt11ComputeNodeE, N4hptt11ComputeNodeE
IndexA 5 is IndexB 4. Start: 5, End: 6, lda: 18000, ldb: 11900, offDiffAB: 1
//...
// Tasks 1-4 Inclusive (same but Level 3 increments start and end by 1)
//...
Compute Node Size: 128 (cur), 128 (next), 128 (gen)
Size t size 8, int size 8, ComputeNode Pointer size 8, nullptr size 8
(Task: 0) Memory locations: 0x137f04390 (cur), 0x6 (next); Types: N4hptt11ComputeNodeE, N4hptt11ComputeNodeE
IndexA 0 is IndexB 3. Start: 4, End: 6, lda: 1, ldb: 700, offDiffAB: 4
Process 96167 stopped
* thread #1, queue = 'com.apple.main-thread', stop reason = EXC_BAD_ACCESS (code=1, address=0x46)
frame #0: 0x0000000100684ddc libhptt.so`hptt::Transpose<float>::createPlans(std::__1::vector<std::__1::shared_ptr<hptt::Plan>, std::__1::allocator<std::__1::shared_ptr<hptt::Plan>>>&) const + 2892
libhptt.so`hptt::Transpose<float>::createPlans:
-> 0x100684ddc <+2892>: ldr x8, [x26, #0x40]
0x100684de0 <+2896>: cbnz x8, 0x100684d7c ; <+2796>
0x100684de4 <+2900>: add w19, w19, #0x1
0x100684de8 <+2904>: ldr w8, [sp, #0xa4]
Target 0: (testframework.exe) stopped.
Upvotes: 0
Views: 38