Reputation: 872

OpenMP Sudoku solver - Parallelize algorithm with openMP and/or MPI

I created a sudoku solver in c and so far it's working.
I'm reading a txt file in and parse it into a 9x9 int array: int sudoku[9][9] I implemented a simple brute force backtracking algorithm where I check every position. If I need to assign a number I loop through the values 1 to 9 and check if they are valid. If they fit I move to the next index.
I need to parallelize the algorithm to work with multiple processors using MPI, but I don't really know where and how to start.

Now I tried to parallelize the execution with OpenMP tasks first. I want to parallelize the check for possible numbers, ideally creating a task for every number that needs to be checked.

My current approach:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <omp.h>
#include <time.h>
#include <unistd.h>

#define SIZE 9
#define UNASSIGNED 0

//  compile with 
//                              gcc -fopenmp <filename>.c -o <name>
//Optional set num of threads:  export OMP_NUM_THREADS=4
//                              ./<name>
clock_t start, end;

void print_grid(int grid[SIZE][SIZE]) {
    for (int row = 0; row < SIZE; row++) {
        for (int col = 0; col < SIZE; col++) {
            printf("%2d", grid[row][col]);
        }
        printf("\n");
    }
}

//https://stackoverflow.com/questions/1726302/removing-spaces-from-a-string-in-c
void remove_spaces(char* s) {
    const char* d = s;
    do {
        while (*d == ' ') {
            ++d;
        }
    } while (*s++ = *d++);
}

int is_exist_row(int grid[SIZE][SIZE], int row, int num){
    for (int col = 0; col < 9; col++) {
        if (grid[row][col] == num) {
            return 1;
        }
    }
    return 0;
}

int is_exist_col(int grid[SIZE][SIZE], int col, int num) {
    for (int row = 0; row < 9; row++) {
        if (grid[row][col] == num) {
            return 1;
        }
    }
    return 0;
}

int is_exist_box(int grid[SIZE][SIZE], int startRow, int startCol, int num) {
    for (int row = 0; row < 3; row++) {
        for (int col = 0; col < 3; col++) {
            if (grid[row + startRow][col + startCol] == num) {
                return 1;
            } 
        }
    }
    return 0;
}

int is_safe_num(int grid[SIZE][SIZE], int row, int col, int num) {
    return !is_exist_row(grid, row, num) 
            && !is_exist_col(grid, col, num) 
            && !is_exist_box(grid, row - (row % 3), col - (col %3), num);
}

int find_unassigned(int grid[SIZE][SIZE], int *row, int *col) {
    for (*row = 0; *row < SIZE; (*row)++) {
        for (*col = 0; *col < SIZE; (*col)++) {
            if (grid[*row][*col] == 0) {
                return 1;
            }
        }
    }
    return 0;
}


int solve(int grid[SIZE][SIZE]) {
    
    int row = 0;
    int col = 0;
    
    if (!find_unassigned(grid, &row, &col)){
        return 1;
    }
    
    for (int num = 1; num <= SIZE; num++ ) {        
        if (is_safe_num(grid, row, col, num)) {
            int val = 0;
            #pragma omp task firstprivate(grid, row, col,val,num)
            {
                int copy_grid[SIZE][SIZE];
                for (int row = 0; row < SIZE; row++) {
                    for (int col = 0; col < SIZE; col++) {                      
                        copy_grid[row][col] = grid[row][col];
                    }                   
                }
                
                copy_grid[row][col] = num;              
                val = solve(copy_grid);
                
                if(val) {
                    print_grid(copy_grid);
                    end = clock();
                    double time_spent = (double)(end - start) / CLOCKS_PER_SEC;  
                    printf("\nGelöst in %f s\n",time_spent);                    
                    exit(0);                    
                }
            }                       
            
            if (val) {
                return 1;
            }
            
            grid[row][col] = UNASSIGNED;
            #pragma omp taskwait
        }
    }
    
    return 0;
}

int main(int argc, char** argv) {
    
    int sudoku[SIZE][SIZE];
    
    FILE *file;
    char *line = NULL;
    size_t len = 0;
    ssize_t read;
    
    int i,j;
    i =0;
    file = fopen("Test_Sudoku_VeryDifficult.txt","r");
    if(file) {
        while(read = getline(&line, &len,file) != -1) {
            
            remove_spaces(line);                        
            
            for(j = 0; j < SIZE; j++) {
                sudoku[i][j] = (int)line[j] - '0';  //char to int           
            }   
            
            i++;
        }
        fclose(file);
        if (line) free(line);
        
        printf("Size: %d", SIZE);   
        printf("\n");
        
        start = clock();
        printf("Solving Sudoku: \n");
        print_grid(sudoku);
        printf("---------------------\n");
       #pragma omp parallel sections 
       {
           #pragma omp section
           {
               solve(sudoku);   
           }
       }
        exit(EXIT_SUCCESS);
    } else {
        exit(EXIT_FAILURE);
    }
}

I took inspiration from this stackoverflow post

My Sudoku File looks like this

0 0 0 0 0 0 0 1 0 
4 0 0 0 0 0 0 0 0 
0 2 0 0 0 0 0 0 0 
0 0 0 0 5 0 4 0 7
0 0 8 0 0 0 3 0 0 
0 0 1 0 9 0 0 0 0 
3 0 0 4 0 0 2 0 0 
0 5 0 1 0 0 0 0 0 
0 0 0 8 0 6 0 0 0

When I compile and execute this it works, but the Tasks do not get executed parallel. Every task waits for completion of his subtask before going to the next. If I remove the #pragma omp taskwait the program doesn't work correctly and the output is wrong. I added a exit(0) when printing the result because otherwise the program continues to explore other paths. I don't really know what I need to change in order to improve.

I also want to parallelize the algorithm with multiprocessing with MPI, But I don't know where to start. I'm grateful for any help or resources that point me in the right direction.

Thanks in advance for any help or suggestions on how to improve.

Upvotes: 1

Answers (2)

Laci

Reputation: 2818

If you wish to increase the speed of your openMP program you have to:

place #pragma omp taskwait after the for loop not inside (as also suggested and explained by @Victor)
reduce the number of tasks spawned. Your program encounters #pragma omp task directive 26590294 times. The workload of a single task is therefore too small and the overhead of task creation become significant compared to the workload of a task. Of course, the OpenMP runtime will not start 26 million tasks, it merges them, but anyway it will cause overhead.

To reduce the number of tasks created (which also means the increase of the workload of a task), you can use either the final or if clause in #pragma omp task directive. I have introduced a new variable level to keep track of recursion depth. On my computer (and also on Godbolt) using final(level>1) gives the fastest runtime, but it depends both on the number of threads used and on the actual sudoku solved. So the modified program is:

int solve(int grid[SIZE][SIZE], int level) {                  
    ....
    for (int num = 1; num <= SIZE; num++ ) {   
            ....
            #pragma omp task firstprivate(grid,row,col,val,num,level) final(level>1)
            {           
                .... 
                val = solve(copy_grid, level+1);
                .....
            }                                       
            ....                        
    }
    #pragma omp taskwait
    return 0;
}

In main it is better to use #pragma omp single nowait or #pragma omp master (as already suggested):

#pragma omp parallel
#pragma omp single nowait
{
   solve(sudoku,1);   
}

Godbolt link is here.

Upvotes: 0

Victor Eijkhout

Reputation: 5810

Inside your solve call there is a loop over (effectively) all the numbers that can still be placed. In each iteration you create a task and then wait for it. That means you are completely sequential.

Instead of the wait for each task, put a taskgroup before the loop, so that the iterations become spawned in parallel and finish as a group.

But when you get this working, you'll find that the parallel version takes much longer to run than the sequential, because the sequential version stops when you find a solution, and the parallel version traverses the whole search space. And breaking out of a parallel search is hard. It's the same problem that you can not parallelize a while loop with OpenMP.

OpenMP 4 to the rescue: you can cancel a taskgroup, effectively breaking out of it.

Coincidentally, I did a very similar programming exercise a while back: https://pages.tacc.utexas.edu/~eijkhout/pcse/html/omp-examples.html#Depth-firstsearch

Doing this with MPI is even harder, partly because synchronization is very costly. (In fact, you'd probably have to blow up your sudoko to a 100x100 version to overcome the overhead.) The easiest solution is probably to divide the search space on the top level.

Upvotes: 2

OpenMP Sudoku solver - Parallelize algorithm with openMP and/or MPI

Answers (2)

Related Questions