Reputation: 187
I am using QThreadPool to run workers, each of which creates and then clears a huge QVector and writes a very large file. However, every time one worker reaches those lines (QVector::clear/QFile::close), all the other threads freeze and only continue once that call has finished.
Does anyone have a suggestion for overcoming this situation? I would like the other threads to keep running normally while one of the workers is executing those two functions. For QFile::close, I tried calling QFile::flush inside my loop instead of calling close() only at the end of the iterations, but it did not help performance.
This is the code in which the threads slow down while the vector is being cleared:
main.cpp
#include "mainwindow.h"
#include <QApplication>
int main(int argc, char *argv[])
{
QApplication a(argc, argv);
MainWindow w;
w.show();
return a.exec();
}
mainwindow.h
#ifndef MAINWINDOW_H
#define MAINWINDOW_H
#include <QMainWindow>
namespace Ui {
class MainWindow;
}
// Top-level window of the demo. It owns the generated form object and
// exposes the handler for the Start button.
class MainWindow : public QMainWindow
{
    Q_OBJECT

public:
    explicit MainWindow(QWidget *parent = nullptr);
    ~MainWindow();

private slots:
    // Queues the batch of worker tasks on the global thread pool.
    void on_start_pushButton_clicked();

private:
    // Form object; created in the constructor and deleted in the destructor.
    Ui::MainWindow *ui;
};
#endif // MAINWINDOW_H
mainwindow.cpp
#include "mainwindow.h"
#include "ui_mainwindow.h"
#include "worker.h"
#include <QDebug>
#include <QSharedPointer>
#include <QThread>
#include <QThreadPool>
// Builds the form and immediately launches the worker batch, as if the
// Start button had been pressed.
MainWindow::MainWindow(QWidget *parent)
    : QMainWindow(parent)
    , ui(new Ui::MainWindow)
{
    ui->setupUi(this);

    // Kick the workers off right away instead of waiting for a click.
    on_start_pushButton_clicked();
}
// Frees the form object that the constructor allocated with new.
MainWindow::~MainWindow()
{
    delete ui;
}
void MainWindow::on_start_pushButton_clicked()
{
int numProcess = 20;
int numTraces = 10000;
int numSamps = 8680;
qDebug() << "main" << QThread::currentThread();
QThreadPool *pool = QThreadPool::globalInstance();
for (int i=0; i<numProcess; i++) {
worker *w= new worker;
w->setAutoDelete(true);
w->setData(i+1, numTraces, numSamps);
pool->start(w);
}
}
mainwindow.ui
<?xml version="1.0" encoding="UTF-8"?>
<!-- Form description for MainWindow: a 400x300 main window whose central
     widget holds a single push button. -->
<ui version="4.0">
<class>MainWindow</class>
<widget class="QMainWindow" name="MainWindow">
<property name="geometry">
<rect>
<x>0</x>
<y>0</y>
<width>400</width>
<height>300</height>
</rect>
</property>
<property name="windowTitle">
<string>MainWindow</string>
</property>
<widget class="QWidget" name="centralWidget">
<!-- The Start button, placed at a fixed position (240,50) with size 75x23;
     no layout manager is used. -->
<widget class="QPushButton" name="start_pushButton">
<property name="geometry">
<rect>
<x>240</x>
<y>50</y>
<width>75</width>
<height>23</height>
</rect>
</property>
<property name="text">
<string>Start</string>
</property>
</widget>
</widget>
</widget>
<layoutdefault spacing="6" margin="11"/>
<resources/>
<!-- No signal/slot connections are declared in the form itself. -->
<connections/>
</ui>
worker.h
#ifndef WORKER_H
#define WORKER_H
#include <QObject>
#include <QRunnable>
#include <QThread>
class worker : public QObject, public QRunnable
{
Q_OBJECT
public:
explicit worker(QObject *parent = nullptr) : QObject(parent), QRunnable () {}
~worker() {}
void setData(int id, int numTraces, int numSamps);
void run();
signals:
public slots:
private:
void clearVector();
int id, numTraces, numSamps;
};
#endif // WORKER_H
worker.cpp
#include "worker.h"
#include <QCoreApplication>
#include <QDebug>
#include <QVector>
// Records the worker id and the data-set dimensions, then logs them.
void worker::setData(int id1, int numTraces, int numSamps)
{
    // The last two parameters shadow the members of the same name, so the
    // members have to be reached through this->.
    this->numSamps = numSamps;
    this->numTraces = numTraces;
    id = id1;

    qDebug() << "setData" << id << numTraces << numSamps;
}
// Task body: performs the fill-and-clear cycle, then logs completion
// together with the thread that executed it.
void worker::run()
{
    clearVector();

    qDebug() << "pool finished"
             << id << numTraces << numSamps
             << QThread::currentThread();
}
// Builds two numTraces x numSamps tables of floats, logs progress along the
// way, and finally clears both tables.
//
// All containers are sized up front with reserve(). The final element
// counts are known in advance, so growing them append by append would only
// cause repeated reallocation and copying of very large buffers.
void worker::clearVector()
{
    QVector<QVector<float>> traces1, traces2;
    traces1.reserve(numTraces);
    traces2.reserve(numTraces);

    // Number of traces that corresponds to one percent of the work.
    const float progressWaypoint = 0.01f * numTraces;
    int progressPos = 0;

    for (int i = 0; i < numTraces; i++) {
        QVector<float> trace1, trace2;
        trace1.reserve(numSamps);
        trace2.reserve(numSamps);
        for (int j = 0; j < numSamps; j++) {
            trace1.append(float(j));
            trace2.append(float(numSamps - j));
        }
        traces1.append(trace1);
        traces2.append(trace2);

        // NOTE(review): this runs on a pool thread, and processEvents()
        // services the calling thread's event queue. Confirm whether these
        // calls are needed at all here.
        if (numTraces <= 100) {
            // Small data sets: yield after every trace.
            QCoreApplication::processEvents();
        } else if (i + 1 >= round(progressWaypoint * progressPos)) {
            // Large data sets: yield and report once per waypoint.
            QCoreApplication::processEvents();
            qDebug() << id << QThread::currentThread() << progressPos;
            progressPos++;
        }
    }

    traces1.clear();
    traces2.clear();
}
Upvotes: 2
Views: 266
Reputation: 4869
Interesting problem. Testing on Windows, Qt 5.12.4.
One thing I've determined so far is that std::vector seems to perform better in this case. It still takes quite a long time and does affect other threads on the system, leaving the UI only somewhat responsive, but it is better than QVector.
Also, these are large numbers and require significant memory. On my 32-bit MinGW build it crashes with out-of-memory errors when I try to use more than 2 threads. The tests were therefore done with a 64-bit MSVC2017 build. The test machine has 8 cores @ 3.? GHz with 64 GB RAM.
Here are some timing results (code used to generate this is below):
1 worker with 2 `std::vector`s:
Worker 1 finished (ms) 1648
Last worker finished after 1649 total ms.
5 workers with 2 `std::vector`s:
Worker 1 finished (ms) 44363
Worker 2 finished (ms) 44386
Worker 3 finished (ms) 44388
Worker 4 finished (ms) 44401
Worker 5 finished (ms) 44448
Last worker finished after 44449 total ms.
10 workers with 2 `std::vector`s:
Worker 4 finished (ms) 84910
Worker 7 finished (ms) 92701
Worker 2 finished (ms) 111590
Worker 8 finished (ms) 144678
Worker 9 finished (ms) 145378
Worker 5 finished (ms) 169067
Worker 3 finished (ms) 211629
Worker 1 finished (ms) 220098
Worker 10 finished (ms) 249356
Worker 6 finished (ms) 253452
Last worker finished after 253453 total ms.
1 worker with 2 `QVector`s:
Worker 1 finished (ms) 1871
Last worker finished after 1872 total ms.
5 workers with 2 `QVector`s:
Worker 1 finished (ms) 36492
Worker 3 finished (ms) 58157
Worker 5 finished (ms) 79132
Worker 2 finished (ms) 84612
Worker 4 finished (ms) 84819
Last worker finished after 84820 total ms.
10 workers with 2 `QVector`s:
Worker 7 finished (ms) 234770
Worker 8 finished (ms) 247531
Worker 9 finished (ms) 261346
Worker 1 finished (ms) 261924
Worker 4 finished (ms) 270520
Worker 2 finished (ms) 275740
Worker 10 finished (ms) 290605
Worker 3 finished (ms) 293575
Worker 6 finished (ms) 296074
Worker 5 finished (ms) 296249
Last worker finished after 296361 total ms.
At a certain point between 5 and 10 threads even std::vector seems to start "tripping over itself." This is also obvious in GUI responsiveness (somewhat responsive at 5, hardly at all at 10).
As mentioned in the comments of the OP, the delay happens during de-allocation of the large vectors traces1 and traces2, not, apparently, during clear() (or swap() for that matter). But the only way to determine this is with a debugger, because once it hits the end of the clearVector() function the thread is essentially hung up (trying to timestamp this with a timer is useless).
I also tried using only 1 vector "set" inside Worker (see code). Huge difference:
10 workers with 1 `std::vector`:
Worker 5 finished (ms) 4125
Worker 4 finished (ms) 4139
Worker 1 finished (ms) 4141
Worker 6 finished (ms) 4153
Worker 10 finished (ms) 4161
Worker 9 finished (ms) 4177
Worker 7 finished (ms) 4197
Worker 3 finished (ms) 4216
Worker 8 finished (ms) 4209
Worker 2 finished (ms) 4221
Last worker finished after 4222 total ms.
10 workers with 1 `QVector`:
Worker 10 finished (ms) 4308
Worker 2 finished (ms) 4358
Worker 1 finished (ms) 4373
Worker 3 finished (ms) 4385
Worker 8 finished (ms) 4391
Worker 4 finished (ms) 4400
Worker 6 finished (ms) 4404
Worker 7 finished (ms) 4401
Worker 5 finished (ms) 4409
Worker 9 finished (ms) 4406
Last worker finished after 4410 total ms.
Here's my test "rig":
#include <QRunnable>
#include <QThread>
#include <QElapsedTimer>
#include <QtWidgets>
#define USE_QVECTOR 0
#define NUM_VECTORS 2
#define USE_CLEAR 0
#define USE_SWAP 0
// Benchmark task for the test rig. Each instance fills one or two nested
// vectors (QVector or std::vector, selected by USE_QVECTOR) with
// numTraces x numSamps integers and reports progress and elapsed time
// through progressChanged().
class Worker : public QObject, public QRunnable
{
    Q_OBJECT

public:
#if USE_QVECTOR
    typedef QVector<int> vect_t;
    typedef QVector<vect_t> vectVect_t;
#else
    typedef std::vector<int> vect_t;
    typedef std::vector<vect_t> vectVect_t;
#endif

    explicit Worker(int id, int traces, int samples, QObject *parent = nullptr)
        : QObject(parent)
        , QRunnable()
        , id(id)
        , numTraces(traces)
        , numSamps(samples)
    {
    }

    // Task body. Emits -1 on start and the elapsed milliseconds on finish,
    // both through progressChanged().
    void run() override
    {
        qDebug() << "worker starting" << id << numTraces << numSamps << QThread::currentThread();
        emit progressChanged(id, -1);
        tim.start();
        clearVector();
        emit progressChanged(id, tim.elapsed());
    }

signals:
    // pos carries the start marker (-1), a progress step, or the elapsed
    // time in ms, depending on the emitter.
    void progressChanged(int id, int pos) const;

private:
    // Populates the trace tables, then optionally clears or swaps them away
    // according to USE_CLEAR / USE_SWAP.
    void clearVector()
    {
        // Whether the second table is exercised at all.
        const bool useSecondSet = (NUM_VECTORS > 1);

        vectVect_t traces1, traces2;
        traces1.reserve(numTraces);
        if (useSecondSet)
            traces2.reserve(numTraces);

        // Number of traces that corresponds to one percent of the work.
        const float waypoint = 0.01f * numTraces;
        int nextStep = 0;

        for (int row = 0; row < numTraces; ++row) {
            vect_t trace1, trace2;
            trace1.reserve(numSamps);
            if (useSecondSet)
                trace2.reserve(numSamps);

            for (int col = 0; col < numSamps; ++col) {
                trace1.push_back(col);
                if (useSecondSet)
                    trace2.push_back(numSamps - col);
            }

            traces1.push_back(trace1);
            if (useSecondSet)
                traces2.push_back(trace2);

            // Report each waypoint once, then advance to the next one.
            if (row + 1 >= round(waypoint * nextStep)) {
                emit progressChanged(id, nextStep);
                ++nextStep;
            }
        }
        qDebug() << "Vectors populated in" << tim.elapsed();

        if (USE_CLEAR) {
            // Clearing the vectors slows the process down a bit but it is
            // not where the delay is.
            traces1.clear();
            if (useSecondSet)
                traces2.clear();
        }

        if (USE_SWAP) {
            // swap is very fast but it doesn't help overall performance
            vectVect_t blank;
            traces1.swap(blank);
            if (useSecondSet)
                traces2.swap(blank);
        }
    }

    int id, numTraces, numSamps;
    QElapsedTimer tim;
};
int main(int argc, char *argv[]) {
QApplication a(argc, argv);
// UI setup
QDialog d;
d.setLayout(new QVBoxLayout());
QPushButton *pbStart = new QPushButton("Start", &d);
QSpinBox *sbThreads = new QSpinBox(&d);
sbThreads->setValue(5);
QSpinBox *sbTraces = new QSpinBox(&d);
sbTraces->setMaximum(10000);
sbTraces->setValue(10000);
QSpinBox *sbSamps = new QSpinBox(&d);
sbSamps->setMaximum(10000);
sbSamps->setValue(8680);
QHBoxLayout *btnLo = new QHBoxLayout();
btnLo->setSpacing(6);
btnLo->addWidget(pbStart);
btnLo->addWidget(new QLabel("Thrds:", &d));
btnLo->addWidget(sbThreads, 1);
btnLo->addWidget(new QLabel("Traces:", &d));
btnLo->addWidget(sbTraces, 1);
btnLo->addWidget(new QLabel("Samps:", &d));
btnLo->addWidget(sbSamps, 1);
d.layout()->addItem(btnLo);
// Text box for showing results
QTextEdit *e = new QTextEdit(&d);
e->setReadOnly(true);
e->setTextInteractionFlags(Qt::TextBrowserInteraction);
d.layout()->addWidget(e);
QElapsedTimer tim; // total elapsed timer
QVector<int> finished; // keep track of finished workers
// Set up workers on button click.
QObject::connect(pbStart, &QPushButton::clicked, &d, [&]()
{
const int threads = sbThreads->value(),
traces = sbTraces->value(),
samples = sbSamps->value();
QThreadPool *pool = QThreadPool::globalInstance();
//pool->setStackSize(samples * 4 * traces * threads);
qDebug() << "Pool max. threads:" << pool->maxThreadCount() << "Stack size:" << pool->stackSize();
pbStart->setDisabled(true);
finished.clear();
tim.start();
for (int i=0; i < threads; i++) {
Worker *w = new Worker(i+1, traces, samples);
// Show messages on worker progress updates
QObject::connect(w, &Worker::progressChanged, &d, [e, pbStart, threads, &tim, &finished](int id, int pos)
{
const QString msg = QStringLiteral("Worker %1 %2 %3")
.arg(id)
.arg(pos < 0 ? "started" : pos > 100 ? "finished (ms)" : "progress")
.arg(pos);
e->append(msg);
if (pos > 100) {
finished << id;
if (finished.count() == threads) {
e->append(QStringLiteral("Last worker finished after %1 total ms.").arg(tim.elapsed()));
pbStart->setEnabled(true);
}
}
e->ensureCursorVisible();
}, Qt::QueuedConnection);
w->setAutoDelete(true);
pool->start(w);
qDebug() << "Queued worker" << i+1 << "with active thread count:" << pool->activeThreadCount();
}
});
d.show();
return a.exec();
}
#include "main.moc"
ADDED: Using fixed-size arrays instead of vectors. Obviously in real code some care would need to be taken to ensure the array indexes are actually valid. (Also, of course, one could populate the traces1 and traces2 arrays directly in the inner loop, without the intermediate trace1/trace2, but never mind that for now. :)
// Array-based variant of clearVector(): each data set lives in ONE
// contiguous buffer of numTraces * numSamps ints instead of one container
// per trace, so a worker performs two large allocations rather than
// thousands of small ones.
//
// The earlier form of this function stored the addresses of loop-local
// arrays in a pointer table and read through them after the loop. Those
// arrays no longer exist at that point and every row aliased the same
// stack storage, so the data set was never really built. Here every trace
// has its own slice of the buffer, which stays alive until the function
// returns. Timings from this version therefore include really allocating
// and filling the data.
void clearVector()
{
    float progressWaypoint = 0.01f * numTraces;
    int progressPos = 0;

    // Treat negative dimensions as empty rather than wrapping to huge sizes.
    const std::size_t traceCount = numTraces > 0 ? static_cast<std::size_t>(numTraces) : 0;
    const std::size_t sampleCount = numSamps > 0 ? static_cast<std::size_t>(numSamps) : 0;

    std::vector<int> store1(traceCount * sampleCount);
    std::vector<int> store2(traceCount * sampleCount);

    // volatile to help make sure the compiler isn't just optimizing the
    // writes out.
    volatile int *const traces1 = store1.data();
    volatile int *const traces2 = store2.data();

    for (int i = 0; i < numTraces; i++) {
        // Slice of each buffer that belongs to trace i.
        volatile int *const trace1 = traces1 + static_cast<std::size_t>(i) * sampleCount;
        volatile int *const trace2 = traces2 + static_cast<std::size_t>(i) * sampleCount;
        for (int j = 0; j < numSamps; j++) {
            trace1[j] = j;
            trace2[j] = (numSamps - j);
        }
        if (i + 1 >= round(progressWaypoint * progressPos))
            emit progressChanged(id, progressPos++);
    }

    // Also use a value from the populated arrays to make sure they really
    // exist. The probes are only read when the data set is large enough to
    // contain element [5][5].
    const bool hasProbes = traceCount > 5 && sampleCount > 5;
    const int probe1 = hasProbes ? traces1[0] : 0;
    const int probe2 = hasProbes ? traces2[5 * sampleCount + 5] : 0;
    qDebug() << "Vectors populated in" << tim.elapsed() << probe1 << probe2;
}
I had to add 100 to the timer number because each thread finishes in < 100ms (the result handler only treats values above 100 as "finished").
void run() override {
...
clearVector();
emit progressChanged(id, tim.elapsed() + 100);
}
With 20 threads (16 immediate and 4 get queued) and 10K each of "traces" and "samples" I get:
Last worker finished after 332 total ms.
Also this runs no problem on my 32-bit MinGW build with 20 threads. Same exec times.
Upvotes: 2