Reputation: 1
I've developed a program that reads numbers from a .txt file, stores them in a vector, and runs them through a series of combinations and calculations to determine whether any result matches the number I'm looking for. This processing is done in multiple threads, where each thread handles a varying number of iterations of the parallel for loop.
Long story short, the processing time varies a lot with larger inputs (e.g. 9 numbers): it can be as short as 3 minutes or more than 10 minutes.
Here are the benchmarks I've gathered so far:
8 numbers serial : 18.119 seconds
8 numbers multithread (first-try): 10.238 seconds
8 numbers multithread (second-try): 18.943 seconds
9 numbers serial : 458.980 seconds
9 numbers multithread (first-try): 172.347 seconds
9 numbers multithread (second-try): 519.532 seconds //Seriously?
//Another try after suggested modifications
9 numbers multithread (first-try): 297.017 seconds
9 numbers multithread (second-try): 297.85 seconds
9 numbers multithread (third-try): 304.755 seconds
9 numbers multithread (fourth-try): 396.391 seconds
So the question is: is there any way to improve the multithreaded program so that it takes the least possible amount of time to shuffle/calculate the numbers?
Here's the portion of the code where the parallel for loop occurs (updated with slight modifications):
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <stdlib.h>
#include <algorithm>
#include <stdio.h>
#include <Windows.h>
#include <omp.h>
#define OPERATORSIZE 3
using namespace std;
int cur_target;
ofstream outFile;
string get_operator(int i) {
    switch (i) {
    case 0:
        return "+";
    case 1:
        return "-";
    case 2:
        return "*";
    case 3:
        return "/";
    default:
        return "";
    }
}
int prev_num_pos(vector<int> &cur_equation, int count) {
    for (int i = count - 1; i >= 0; i--) {
        if (cur_equation[i] != -1) return i + 1;
    }
    return 0;
}
bool nextoperator(int k, vector<int> &operator_array) {
    for (int i = k - 2; i >= 0; i--) {
        if (operator_array[i] < OPERATORSIZE) {
            operator_array[i] += 1;
            break;
        }
        else
            operator_array[i] = 0;
        switch (i) {
        case 0:
            return false;
        }
    }
    return true;
}
void vector_combination(vector<int> int_list) { // Generate the number combinations from the number list
    bool div_remainder = false;
    int count = 0;
    // div_remainder gets a private copy per thread; count is reset and used as scratch storage each iteration
    #pragma omp parallel for schedule(dynamic) firstprivate(div_remainder) reduction(+:count)
    for (int i = 0; i < int_list.size(); ++i) {
        vector<int> cur_equation, cur_temp, cur_list, operator_array;
        auto list = int_list;
        rotate(list.begin(), list.begin() + i, list.begin() + i + 1);
        do
        {
            cur_list.clear();
            operator_array.clear();
            for (auto x : list)
                cur_list.push_back(x);
            for (int i = 0; i < cur_list.size() - 1; i++)
                operator_array.push_back(0);
            do
            {
                div_remainder = false;
                count = 0;
                cur_equation = operator_array;
                cur_temp = cur_list;
                for (int i = 0; i < cur_equation.size(); ++i) { // Check for equation priorities
                    if (cur_equation[i] == 3) {
                        count = i;
                        if (cur_temp[count] % cur_temp[count + 1] != 0) {
                            div_remainder = true;
                            break;
                        }
                    }
                }
                if (div_remainder)
                    continue;
                for (int i = 0; i < cur_temp.size() - 1; ++i) { // Evaluate * and / first
                    count = -1;
                    if (cur_equation[i] == 2 || cur_equation[i] == 3) {
                        count = prev_num_pos(cur_equation, i);
                    }
                    else
                        continue;
                    if (cur_equation[i] == 2) {
                        cur_temp[count] *= cur_temp[i + 1];
                        cur_equation[i] = -1;
                    }
                    else if (cur_equation[i] == 3) {
                        if (cur_temp[i + 1] != 0) {
                            cur_temp[count] /= cur_temp[i + 1];
                            cur_equation[i] = -1;
                        }
                        else {
                            div_remainder = true;
                            break;
                        }
                    }
                }
                if (div_remainder)
                    continue;
                for (int i = 0; i < cur_temp.size() - 1; ++i) { // Then evaluate + and -
                    switch (cur_equation[i]) {
                    case 0: {
                        cur_temp[0] += cur_temp[i + 1]; // Addition
                        cur_equation[i] = -1;
                        break;
                    }
                    case 1: { // Subtraction
                        cur_temp[0] -= cur_temp[i + 1];
                        cur_equation[i] = -1;
                        break;
                    }
                    }
                }
                if (cur_temp[0] == cur_target) {
                    #pragma omp critical
                    {
                        for (int i = 0; i < cur_list.size(); ++i) {
                            outFile << cur_list[i];
                            if (i < cur_list.size() - 1) { outFile << get_operator(operator_array[i]); }
                        }
                        outFile << "\n";
                    }
                }
            } while (nextoperator(cur_list.size(), operator_array));
            // Send to function to undergo a list of operator combinations
        } while (next_permutation(list.begin() + 1, list.end()));
    }
}
int main(void) {
    SetPriorityClass(GetCurrentProcess(), HIGH_PRIORITY_CLASS);
    vector<int> int_list;
    string line;
    ifstream myfile("Problem.txt");
    if (myfile.is_open()) {
        while (getline(myfile, line)) {
            int num = stoi(line);
            int_list.push_back(num);
            cur_target = num;
        }
    }
    else
        cout << "Unable to open file." << endl;
    myfile.close();
    int_list.pop_back(); // Remove the target (last line) from the number list
    sort(int_list.begin(), int_list.end());
    outFile.open("answer.txt");
    vector_combination(int_list);
    outFile.close();
    int answer_count = 0;
    myfile.open("answer.txt");
    if (myfile.is_open()) {
        while (getline(myfile, line)) {
            ++answer_count;
            if (answer_count > 1)
                break;
        }
    }
    myfile.close();
    if (answer_count == 0) { // No solution found: write -1
        outFile.open("answer.txt");
        outFile << "-1" << endl;
    }
    outFile.close();
    return 0;
}
As for the sample input, create a .txt file named "Problem.txt" with numbers like the following, where the last number is the target result (updated with the sample input currently used for the benchmarks):
28
55
78
77
33
65
35
62
19
221
The hardware/software specification that the program runs on: Processor: Intel Core i5-2500K (Sandy Bridge), RAM: 8 GB, OS: Windows 10 Professional, IDE: Visual Studio 2015 Enterprise Edition.
Upvotes: 0
Views: 657
Reputation: 1
Apparently the runtime variance was caused by the vectors. I checked with the performance analyzer and noticed that the time spent copying values between vectors was not consistent. I changed them to pointer arrays instead, and the runtime is now tremendously improved and consistent.
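Roughly, the change looks like this (a simplified sketch, not the exact code; MAX_NUMS and evaluate_sketch are made-up names, and the bound is just an assumption that a puzzle never has more than a handful of numbers):

#include <vector>
#include <cstring>
using namespace std;

const int MAX_NUMS = 16; // assumed upper bound on the amount of numbers per puzzle

// The working buffers live on the thread's stack, so the innermost loop no longer
// performs a heap allocation or vector copy on every iteration.
void evaluate_sketch(const vector<int> &cur_list, const vector<int> &operator_array) {
    int cur_temp[MAX_NUMS];
    int cur_equation[MAX_NUMS];
    memcpy(cur_temp, cur_list.data(), cur_list.size() * sizeof(int));
    memcpy(cur_equation, operator_array.data(), operator_array.size() * sizeof(int));
    // ... same evaluation logic as before, now indexing plain arrays ...
}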
Upvotes: 0
Reputation: 22650
Move the #pragma omp critical inside the if condition. Since cur_temp is thread-private and cur_target is global read-only, it is not necessary to protect the condition with a critical section.
This change drastically reduces the direct interaction between the threads and, on my system, consistently speeds up the parallel version.
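To illustrate, a minimal structural sketch of the two placements (based on the output section of the posted code, with the loop body details omitted):

// Before: every comparison goes through the critical section.
#pragma omp critical
{
    if (cur_temp[0] == cur_target) {
        // write the matching equation to outFile
    }
}

// After: only the (rare) matching case enters the critical section.
if (cur_temp[0] == cur_target) {
    #pragma omp critical
    {
        // write the matching equation to outFile
    }
}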
I would weakly guess the performance variations were influenced by the (seemingly random) phase shift between the loops running on different threads.
If performance variation persists, try enabling thread binding. Check the documentation of your OpenMP implementation; look for OMP_PROC_BIND, "thread pinning", "binding", or "affinity".
Upvotes: 1