Reputation: 1557
I'm helping my friend to optimize his memory for this program:
#define _GNU_SOURCE
#include "myutils.h"
#include <pthread.h>
void * mySPMDMain(void *);
pthread_barrier_t mybarrier;
int main (int argc, char ** argv) {
/* Initialize global data here */
/* Start threads*/
int i;
if (argc != 3)
{
fprintf(stderr, "usage: ./psrs <Number of Keys> <Number of threads>\n");
exit(EXIT_FAILURE);
}
struct timeval start;
struct timeval end;
int N = atoi(argv[1]);
int NUM_THREADS = atoi(argv[2]);
int allChunkSize = N/NUM_THREADS;
pthread_t ThreadID[NUM_THREADS];
pthread_barrier_init(&mybarrier, NULL, NUM_THREADS);
long int * originArray = (long int *) malloc(sizeof(long int)*N);
for (i = 0; i<N; ++i)
{
originArray[i] = random() % 100000;
}
/*Initialize the Data Space*/
TCB *myTCB = (TCB *) malloc(sizeof(TCB) *NUM_THREADS);
for (i = 0; i<NUM_THREADS; ++i)
{
myTCB[i].Chunk = (long int*) malloc(sizeof(long int)*allChunkSize);
myTCB[i].passLength = (int*) malloc(sizeof(int)*NUM_THREADS);
myTCB[i].samples = (long int*) malloc(sizeof(long int)*NUM_THREADS);
myTCB[i].tmpMergeSpace = (long int**) malloc(sizeof(long int *)*NUM_THREADS);
myTCB[i].pivotArray = (long int *) malloc(sizeof(long int)*NUM_THREADS*NUM_THREADS);
myTCB[i].selectedPivot = (long int*) malloc(sizeof(long int)*(NUM_THREADS-1));
myTCB[i].eachStartIndex = (int*) malloc(sizeof(int)*NUM_THREADS);
myTCB[i].mergeLength = (int *) malloc(sizeof(int)*NUM_THREADS);
myTCB[i].sampleIndex = calloc((NUM_THREADS-1), sizeof(int));
memcpy(myTCB[i].Chunk, &originArray[i*allChunkSize],allChunkSize*sizeof(long int));
myTCB[i].N = N;
myTCB[i].num_threads = NUM_THREADS;
myTCB[i].ChunkSize = allChunkSize;
myTCB[i].offSet = N/(NUM_THREADS * NUM_THREADS);
}
gettimeofday(&start, NULL);
for (i = 1; i < NUM_THREADS; ++i)
{
myTCB[i].pid = i;
pthread_create(&(ThreadID[i]),NULL, mySPMDMain, (void*) &(myTCB[i]));
}
myTCB[0].pid = 0;
mySPMDMain((void *) &(myTCB[0]));
for (i = 1; i<NUM_THREADS; ++i)
{
pthread_join(ThreadID[i], NULL);
}
gettimeofday(&end, NULL);
double time_spent = (double)(end.tv_sec - start.tv_sec) * 1.0e6 + (double) (end.tv_usec - start.tv_usec);
time_spent = time_spent /1000000;
//printf("time spent: %f\n", time_spent);
memset(originArray, 0, N*sizeof(long int));
int cursor = 0;
for (i = 0; i<NUM_THREADS; ++i)
{
memcpy(&originArray[cursor], myTCB[i].resultArr, sumTotal(myTCB[i].mergeLength, NUM_THREADS) * sizeof(long int));
cursor = cursor+sumTotal(myTCB[i].mergeLength, NUM_THREADS);
}
/* Clean up and exit*/
pthread_barrier_destroy(&mybarrier);
assert(isSorted(originArray,N) == 1);
for (i = 0; i<NUM_THREADS; ++i)
{
free(myTCB[i].mergeLength);
free(myTCB[i].eachStartIndex);
free(myTCB[i].sampleIndex);
free(myTCB[i].selectedPivot);
free(myTCB[i].pivotArray);
free(myTCB[i].samples);
free(myTCB[i].passLength);
free(myTCB[i].Chunk);
free(myTCB[i].resultArr);
for(int j = 0; j< NUM_THREADS; j++)
{
free(myTCB[i].tmpMergeSpace[j]);
}
free(myTCB[i].tmpMergeSpace);
}
free(myTCB);
free(originArray);
return 0;
}
#define MASTER if(localId == 0)
#define BARRIER pthread_barrier_wait(&mybarrier)
void * mySPMDMain(void *arg)
{
TCB * localTCB;
int localId;
/* Actual parameter */
localTCB = (TCB *)arg;
/* Other parameters passed in via global */
localId = localTCB -> pid;
/* Parallel array to TCB */
qsort(localTCB -> Chunk, localTCB -> ChunkSize, sizeof(long int), cmpfunc);
BARRIER;
// Timing
/* Phase 1 */
for (int i = 0; i< localTCB -> num_threads; ++i)
{
long int sample = localTCB->Chunk[i*localTCB -> offSet];
localTCB->samples[i]=sample;
}
BARRIER;
/* Phase 2 */
MASTER {
for (int i = 0; i < localTCB -> num_threads; ++i)
{
memcpy(&(localTCB->pivotArray[i*localTCB -> num_threads]),localTCB[i].samples, localTCB -> num_threads*sizeof(long int));
}
qsort(localTCB -> pivotArray, localTCB -> num_threads*localTCB -> num_threads, sizeof(long int), cmpfunc);
for (int i = 0; i<(localTCB -> num_threads)-1; ++i)
{
localTCB-> selectedPivot[i] = localTCB-> pivotArray[(i+1)*localTCB -> num_threads];
}
for (int i = 1; i<localTCB -> num_threads; ++i)
{
memcpy(localTCB[i].selectedPivot,localTCB[0].selectedPivot,((localTCB -> num_threads)-1)*sizeof(long int));
}
}
BARRIER;
/* Phase 3 */
for (int i = 0; i<(localTCB -> num_threads)-1; ++i)
{
localTCB -> sampleIndex[i] = binarySearch(localTCB->Chunk,0,localTCB->ChunkSize -1, localTCB->selectedPivot[i]);
}
for (int i = 0; i<localTCB -> num_threads; ++i)
{
if(i == 0) localTCB -> eachStartIndex[i] = 0;
else localTCB -> eachStartIndex[i] = localTCB -> sampleIndex[i-1];
}
for (int i = 0; i<localTCB -> num_threads; ++i)
{
if (i == 0) localTCB -> passLength[i] = localTCB -> sampleIndex[i];
localTCB -> passLength[i] = localTCB -> sampleIndex[i] - localTCB -> sampleIndex[i-1];
if (i == (localTCB -> num_threads)-1) localTCB -> passLength[i] = localTCB -> ChunkSize - localTCB -> sampleIndex[i-1];
}
BARRIER;
/* Phase 4 */
MASTER {
for (int i = 0; i<localTCB -> num_threads; ++i)
{
for (int j = 0; j<localTCB -> num_threads; ++j)
{
// each temp merge space = localTCB[i].tmpMergeSpace[j]
// each cpy start = localTCB[j].Chunk[localTCB[j].eachStartIndex[i]]
// each cpy length = localTCB[j].passLength[i] * sizeof(long int)
localTCB[i].tmpMergeSpace[j] = (long int*) malloc(sizeof(long int)*localTCB[j].passLength[i]);
localTCB[i].mergeLength[j] = localTCB[j].passLength[i];
memcpy(localTCB[i].tmpMergeSpace[j], &(localTCB[j].Chunk[localTCB[j].eachStartIndex[i]]), (localTCB[j].passLength[i]) * sizeof(long int));
}
localTCB[i].resultArr = (long int *) malloc(sizeof(long int)*sumTotal(localTCB[i].mergeLength, localTCB -> num_threads));
}
}
BARRIER;
multimerge(localTCB->tmpMergeSpace,localTCB->mergeLength,localTCB -> num_threads,localTCB->resultArr);
BARRIER;
//Timing
return NULL;
} /* mySPMDMain*/
and the TCB struct is this:
typedef struct ThreadControlBlock {
long int *Chunk;
int *passLength;
int *sampleIndex;
int *eachStartIndex;
long int **tmpMergeSpace;
long int *pivotArray;
long int *selectedPivot;
long int *samples;
int *mergeLength;
long int * resultArr;
int pid;
int N;
int num_threads;
int ChunkSize;
int offSet;
} TCB;
I used valgrind
to check if there's any memory leak. valgrind
returns that this program freed memory after execution but has a invalid read of size 4 when executing thread function on the main thread (with pid==0
):
==6150== Memcheck, a memory error detector
==6150== Copyright (C) 2002-2015, and GNU GPL'd, by Julian Seward et al.
==6150== Using Valgrind-3.11.0 and LibVEX; rerun with -h for copyright info
==6150== Command: ./psrs 8000000 8
==6150==
==6150== Thread 8:
==6150== Invalid read of size 4
==6150== at 0x400DC2: mySPMDMain (psrs.c:169)
==6150== by 0x4E3ADC4: start_thread (in /usr/lib64/libpthread-2.17.so)
==6150== by 0x514673C: clone (in /usr/lib64/libc-2.17.so)
==6150== Address 0x911cd5c is 4 bytes before a block of size 28 alloc'd
==6150== at 0x4C29975: calloc (vg_replace_malloc.c:711)
==6150== by 0x401188: main (psrs.c:48)
==6150==
==6150== Invalid read of size 4
==6150== at 0x400DBE: mySPMDMain (psrs.c:169)
==6150== by 0x4E3ADC4: start_thread (in /usr/lib64/libpthread-2.17.so)
==6150== by 0x514673C: clone (in /usr/lib64/libc-2.17.so)
==6150== Address 0x911cd7c is 0 bytes after a block of size 28 alloc'd
==6150== at 0x4C29975: calloc (vg_replace_malloc.c:711)
==6150== by 0x401188: main (psrs.c:48)
==6150==
==6150== Thread 1:
==6150== Invalid read of size 4
==6150== at 0x400DC2: mySPMDMain (psrs.c:169)
==6150== by 0x401266: main (psrs.c:64)
==6150== Address 0x911a89c is 4 bytes before a block of size 28 alloc'd
==6150== at 0x4C29975: calloc (vg_replace_malloc.c:711)
==6150== by 0x401188: main (psrs.c:48)
==6150==
==6150== Invalid read of size 4
==6150== at 0x400DBE: mySPMDMain (psrs.c:169)
==6150== by 0x401266: main (psrs.c:64)
==6150== Address 0x911a8bc is 0 bytes after a block of size 28 alloc'd
==6150== at 0x4C29975: calloc (vg_replace_malloc.c:711)
==6150== by 0x401188: main (psrs.c:48)
==6150==
==6150==
==6150== HEAP SUMMARY:
==6150== in use at exit: 0 bytes in 0 blocks
==6150== total heap usage: 174 allocs, 174 frees, 320,014,664 bytes allocated
==6150==
==6150== All heap blocks were freed -- no leaks are possible
==6150==
==6150== For counts of detected and suppressed errors, rerun with: -v
==6150== ERROR SUMMARY: 16 errors from 4 contexts (suppressed: 0 from 0)
line 169: localTCB -> passLength[i] = localTCB -> sampleIndex[i] - localTCB -> sampleIndex[i-1];
line 63: myTCB[0].pid = 0;
line 48: myTCB[i].sampleIndex = calloc((NUM_THREADS-1), sizeof(int));
In this test run, NUM_THREADS
is 8
. If read correctly, myTCB[i].sampleIndex = calloc((NUM_THREADS-1), sizeof(int));
should read a memory of 4 * (8-1) = 28 bytes, I don't quite get where should this 4 bytes invalid read before this 28 bytes block.
Upvotes: 0
Views: 2166
Reputation: 1557
@Honza is correct, take a look at these lines:
for (int i = 0; i<localTCB->num_threads; ++i)
{
if (i == 0) localTCB->passLength[i] = localTCB->sampleIndex[i];
localTCB->passLength[i] = localTCB->sampleIndex[i] - localTCB->sampleIndex[i-1];
if (i == (localTCB->num_threads)-1) localTCB->passLength[i] = localTCB->ChunkSize - localTCB->sampleIndex[i-1];
}
The branching here is not safe. when i == 0
, for sure it will execute the line under if (i ==0)
condition. BUT this loop will always execute the middle line! Then localTCB->sampleIndex[i-1]
will read the int
from localTCB->sampleIndex[-1]
which is the int byte before the entire sampleIndex
memory. Since this byte is also 0
(not used or populated), the result of the program is still correct, but obviously the "wrong way" for correct result. valgrind
is strict and it will complain. And when i == (localTCB->num_threads)-1
, the middle line will get the wrong value (0
) too, but the final result will be corrected by the last condition. It means the branching is neither correct and efficient. How to change it? Simple, regulate each of the condition should be fine:
for (int i = 0; i<localTCB->num_threads; ++i)
{
if (i == 0) localTCB->passLength[i] = localTCB->sampleIndex[i];
else if (i > 0 && i<(localTCB->num_threads)-1) localTCB->passLength[i] = localTCB->sampleIndex[i] - localTCB->sampleIndex[i-1];
else if (i == (localTCB->num_threads)-1) localTCB->passLength[i] = localTCB->ChunkSize - localTCB->sampleIndex[i-1];
}
Upvotes: 1
Reputation: 1193
Your line 169 is inside a for loop iterating with i
from 0 up, but you are accessing element sampleIndex[i-1]
. In the first iteration, this access is out of bounds.
Upvotes: 2