Reputation: 526
Main program: Start a certain number of child processes, then send each of them SIGINT
right away.
// Fork CHILDS children that each exec the test binary, send every started
// child SIGINT immediately, then reap them all.
//
// Returns 0 once no children remain. Failures of fork(), execv() and kill()
// are reported on standard error.
int main()
{
    pid_t childs[CHILDS];
    char *execv_argv[3];
    int n = CHILDS;

    execv_argv[0] = "./debugging_procs/wait_time_at_interrupt";
    execv_argv[1] = "2";
    execv_argv[2] = NULL;

    for (int i = 0; i < n; i++)
    {
        childs[i] = fork();
        if (childs[i] == -1)
        {
            // Never keep a -1 pid around: kill(-1, SIGINT) would signal
            // every process we are allowed to signal.
            perror("fork");
            n = i;
            break;
        }
        if (childs[i] == 0)
        {
            // execv() only returns on failure, so errno is valid here.
            execv(execv_argv[0], execv_argv);
            perror("execv");
            _exit(1);
        }
    }

    // sleep(1);
    for (int i = 0; i < n; i++)
    {
        // errno is only meaningful right after a call that reported failure.
        if (kill(childs[i], SIGINT) == -1)
            perror("kill");
    }

    // Wait for all children.
    while (wait(NULL) > 0);
    return 0;
}
Forked program: Wait for any signal. If SIGINT is received, open a certain file and append "SIGINT" and the current pid to it, then wait the specified number of seconds (in this case, the main program passes 2).
#include <signal.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <string.h>
// SIGINT handler: appends the line "SIGINT <pid>\n" to ./aux/log1.
//
// Only open(), getpid(), write() and close() are called, so the handler
// stays async-signal-safe; the pid is formatted by hand because sprintf()
// is not safe to call from a signal handler. The log file is not created
// here: if it cannot be opened, nothing is written.
void sigint_handler(int signum)
{
    static const char prefix[] = "SIGINT ";
    char buf[124];
    char digits[32];
    size_t len = 0;
    size_t ndigits = 0;
    long pid = (long)getpid();
    ssize_t written;
    int fd;

    (void)signum;

    fd = open("./aux/log1", O_WRONLY | O_APPEND);
    if (fd == -1)
        return;

    for (size_t i = 0; i + 1 < sizeof prefix; i++)
        buf[len++] = prefix[i];

    // Collect the decimal digits least-significant first, then reverse them.
    do
    {
        digits[ndigits++] = (char)('0' + pid % 10);
        pid /= 10;
    } while (pid > 0 && ndigits < sizeof digits);
    while (ndigits > 0)
        buf[len++] = digits[--ndigits];
    buf[len++] = '\n';

    // A single write() keeps each O_APPEND record in one piece; there is
    // nothing safe to do about a failure from inside a handler.
    written = write(fd, buf, len);
    (void)written;
    close(fd);
}
// Install the SIGINT handler, block until any handled signal arrives, then
// sleep for the requested number of seconds before exiting.
//
// argv[1] (optional): non-negative number of seconds to sleep after the
// signal; defaults to 5. Returns 0 on success, 1 if the argument is not a
// valid wait time or the handler cannot be installed.
int main(int argc, char **argv)
{
    unsigned int wait_time = 5;

    if (argc > 1)
    {
        char *end;
        long value = strtol(argv[1], &end, 10);

        // Reject empty, non-numeric, negative and out-of-range input:
        // a negative value would otherwise turn into a huge unsigned sleep.
        if (end == argv[1] || *end != '\0' || value < 0
            || (unsigned long)value > (unsigned int)-1)
        {
            fprintf(stderr, "%s: invalid wait time.\n", argv[1]);
            return 1;
        }
        wait_time = (unsigned int)value;
    }

    if (signal(SIGINT, &sigint_handler) == SIG_ERR)
    {
        perror("signal");
        return 1;
    }

    // Wait for any signal.
    pause();
    sleep(wait_time);
    return 0;
}
The problem is, that the log file that the children should write, doesn't have n
lines, meaning that not all children wrote to it. Sometimes nobody writes anything and the main program doesn't wait
at all (meaning that sleep()
isn't called in this case).
But if I uncomment sleep(1)
in the main program, everything works just as I expected.
I suspect that the child processes don't get enough time to listen to SIGINT
.
The program I'm working on is a task control and when I run a command like:
restart my_program; restart my_program
I get an unstable behaviour. When I call restart, a SIGINT
is sent, then a new fork()
is called then another SIGINT
is sent, just like the example above.
How can I make sure all children will handle SIGINT
without the sleep(1)
line? I'm testing my program if it can handle programs that don't exit right away after SIGINT is sent.
If I add for example, printf("child process started\n");
at the top of the child program, it doesn't get printed and the main program doesn't wait for anything, unless I sleep
for a second. This happens even with only 1 child process.
Upvotes: 1
Views: 1825
Reputation: 39326
Everything is working as it should. Some of your child processes get killed by the signal, before they set up the signal handler, or even before they start executing the child binary.
In your parent process, instead of just wait()
ing until there are no more child processes, you could examine the identity and exit status of each of the processes reaped. Replace while (wait(NULL) > 0);
with
{
    /* Reap every remaining child and report how each one ended. */
    int status;
    pid_t p;

    for (;;) {
        p = wait(&status);
        if (p <= 0)
            break;

        if (WIFEXITED(status)) {
            printf("Child %ld exit status was %d.\n", (long)p, WEXITSTATUS(status));
        } else if (WIFSIGNALED(status)) {
            printf("Child %ld was killed by signal %d.\n", (long)p, WTERMSIG(status));
        } else {
            printf("Child %ld was lost.\n", (long)p);
        }

        /* Flush after each report so the output is not held back in the stdio buffer. */
        fflush(stdout);
    }
}
and you'll see that the "missing" child processes were terminated by the signals. This means that the child process was killed before it was ready to catch the signal.
I wrote my own example program pairs, with complete error checking. Instead of a signal handler, I decided to use sigprocmask()
and sigwaitinfo()
, just to show another way to do the same thing (and to not be limited to async-signal safe functions in a signal handler).
parent.c:
#define _POSIX_C_SOURCE 200809L
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <signal.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
/*
 * Return a short printable name for a signal number.
 *
 * SIGINT, SIGHUP and SIGTERM map to "INT", "HUP" and "TERM". Any other
 * value is formatted as a decimal number into a static buffer, so that
 * result is overwritten by the next call that takes the same path.
 */
const char *signal_name(const int signum)
{
    static char buffer[32];

    if (signum == SIGINT)
        return "INT";
    if (signum == SIGHUP)
        return "HUP";
    if (signum == SIGTERM)
        return "TERM";

    snprintf(buffer, sizeof buffer, "%d", signum);
    return buffer;
}
/*
 * qsort() comparator for pid_t elements: orders process IDs ascending.
 * Returns -1, 0 or +1.
 */
static int compare_pids(const void *p1, const void *p2)
{
    const pid_t *lhs = p1;
    const pid_t *rhs = p2;

    if (*lhs < *rhs)
        return -1;
    if (*lhs > *rhs)
        return +1;
    return 0;
}
/*
 * Fork COUNT child processes that each execute PATH-TO-BINARY, send every
 * child SIGINT immediately, then reap and report each one.
 *
 * argv[1]: COUNT, a positive integer.
 * argv[2..]: the binary to execute and its arguments.
 *
 * Returns EXIT_SUCCESS once all children have been reaped, and EXIT_FAILURE
 * on bad usage, allocation failure, failure to fork the first child, or an
 * unexpected wait() error. If a later fork() fails, the test continues with
 * the children forked so far.
 */
int main(int argc, char *argv[])
{
    size_t count, r, i;
    int status;
    pid_t *child, *reaped, p;
    char dummy;

    if (argc < 3 || !strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) {
        fprintf(stderr, "\n");
        fprintf(stderr, "Usage: %s [ -h | --help ]\n", argv[0]);
        fprintf(stderr, "       %s COUNT PATH-TO-BINARY [ ARGS ... ]\n", argv[0]);
        fprintf(stderr, "\n");
        fprintf(stderr, "This program will fork COUNT child processes,\n");
        fprintf(stderr, "each child process executing PATH-TO-BINARY.\n");
        fprintf(stderr, "Immediately after all child processes have been forked,\n");
        fprintf(stderr, "they are sent a SIGINT signal.\n");
        fprintf(stderr, "\n");
        return EXIT_FAILURE;
    }

    /* The trailing %c catches garbage after the number. */
    if (sscanf(argv[1], " %zu %c", &count, &dummy) != 1 || count < 1) {
        fprintf(stderr, "%s: Invalid count.\n", argv[1]);
        return EXIT_FAILURE;
    }

    /* calloc() rejects a COUNT whose total byte size would overflow size_t. */
    child = calloc(count, sizeof child[0]);
    reaped = calloc(count, sizeof reaped[0]);
    if (!child || !reaped) {
        fprintf(stderr, "%s: Count is too large; out of memory.\n", argv[1]);
        free(child);
        free(reaped);
        return EXIT_FAILURE;
    }

    for (i = 0; i < count; i++) {
        p = fork();
        if (p == -1) {
            if (i == 0) {
                fprintf(stderr, "Cannot fork child processes: %s.\n", strerror(errno));
                free(child);
                free(reaped);
                return EXIT_FAILURE;
            } else {
                /* Carry on with the children we did manage to fork. */
                fprintf(stderr, "Cannot fork child %zu: %s.\n", i + 1, strerror(errno));
                count = i;
                break;
            }
        } else
        if (!p) {
            /* Child process */
            execvp(argv[2], argv + 2);
            {
                /* execvp() only returns on failure. */
                const char *errmsg = strerror(errno);
                fprintf(stderr, "Child process %ld: Cannot execute %s: %s.\n",
                                (long)getpid(), argv[2], errmsg);
                /* _exit(): do not run the parent's exit-time cleanup in the child. */
                _exit(EXIT_FAILURE);
            }
        } else {
            /* Parent process. */
            child[i] = p;
        }
    }

    /* Send all children the INT signal. */
    for (i = 0; i < count; i++)
        if (kill(child[i], SIGINT) == -1)
            fprintf(stderr, "Cannot signal child process %ld: %s.\n",
                            (long)child[i], strerror(errno));

    /* Reap and report each child. */
    r = 0;
    while (1) {
        p = wait(&status);
        if (p == -1) {
            if (errno == EINTR)
                continue;   /* Interrupted by a signal; just retry. */
            if (errno == ECHILD)
                break;      /* No children left. */
            fprintf(stderr, "Error waiting for child processes: %s.\n", strerror(errno));
            free(child);
            free(reaped);
            return EXIT_FAILURE;
        }

        if (r < count)
            reaped[r++] = p;
        else
            fprintf(stderr, "Reaped an extra child process!\n");

        if (WIFEXITED(status)) {
            switch (WEXITSTATUS(status)) {
            case EXIT_SUCCESS:
                printf("Parent: Reaped child process %ld: EXIT_SUCCESS.\n", (long)p);
                break;
            case EXIT_FAILURE:
                printf("Parent: Reaped child process %ld: EXIT_FAILURE.\n", (long)p);
                break;
            default:
                printf("Parent: Reaped child process %ld: Exit status %d.\n", (long)p, WEXITSTATUS(status));
                break;
            }
            fflush(stdout);
        } else
        if (WIFSIGNALED(status)) {
            printf("Parent: Reaped child process %ld: Terminated by %s.\n", (long)p, signal_name(WTERMSIG(status)));
            fflush(stdout);
        } else {
            printf("Parent: Reaped child process %ld: Lost.\n", (long)p);
            fflush(stdout);
        }
    }

    if (r == count) {
        /* Sort both pid arrays so they can be compared element by element. */
        qsort(child, count, sizeof child[0], compare_pids);
        qsort(reaped, count, sizeof reaped[0], compare_pids);
        for (i = 0; i < count; i++)
            if (child[i] != reaped[i])
                break;
        if (i == count)
            printf("Parent: All %zu child processes were reaped successfully.\n", count);
        else
            fprintf(stderr, "Parent: Reaped process IDs do not match the forked children.\n");
    } else {
        fprintf(stderr, "Parent: Reaped only %zu of %zu child processes.\n", r, count);
    }

    free(child);
    free(reaped);
    return EXIT_SUCCESS;
}
child.c:
#define _POSIX_C_SOURCE 200809L
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <signal.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
/*
 * Map a signal number to a short printable name.
 *
 * Known signals (SIGINT, SIGHUP, SIGTERM) yield "INT", "HUP" and "TERM".
 * Anything else yields its decimal value, stored in a static buffer that
 * the next such call reuses.
 */
const char *signal_name(const int signum)
{
    static char buffer[32];
    const char *name = NULL;

    switch (signum) {
    case SIGINT:
        name = "INT";
        break;
    case SIGHUP:
        name = "HUP";
        break;
    case SIGTERM:
        name = "TERM";
        break;
    default:
        break;
    }

    if (name == NULL) {
        snprintf(buffer, sizeof buffer, "%d", signum);
        name = buffer;
    }
    return name;
}
/*
 * Announce startup, block SIGINT/SIGHUP/SIGTERM, wait synchronously for one
 * of them with sigwaitinfo(), and report which signal arrived and who sent it.
 *
 * Returns EXIT_SUCCESS after a signal was received, EXIT_FAILURE if
 * sigwaitinfo() fails.
 */
int main(void)
{
    const long mypid = getpid();
    siginfo_t details;
    sigset_t wanted;
    int signum;

    printf("Child: Child process %ld started!\n", mypid);
    fflush(stdout);

    /* Block the signals of interest so sigwaitinfo() can collect them. */
    sigemptyset(&wanted);
    sigaddset(&wanted, SIGINT);
    sigaddset(&wanted, SIGHUP);
    sigaddset(&wanted, SIGTERM);
    sigprocmask(SIG_BLOCK, &wanted, NULL);

    signum = sigwaitinfo(&wanted, &details);
    if (signum == -1) {
        printf("Child: Child process %ld failed: %s.\n", mypid, strerror(errno));
        return EXIT_FAILURE;
    }

    /* A sender pid of zero is reported as coming from the terminal. */
    if (details.si_pid == 0) {
        printf("Child: Child process %ld terminated by signal %s via terminal.\n", mypid, signal_name(signum));
    } else if (details.si_pid == getppid()) {
        printf("Child: Child process %ld terminated by signal %s sent by the parent process %ld.\n",
               mypid, signal_name(signum), (long)details.si_pid);
    } else {
        printf("Child: Child process %ld terminated by signal %s sent by process %ld.\n",
               mypid, signal_name(signum), (long)details.si_pid);
    }

    return EXIT_SUCCESS;
}
Compile both using e.g.
gcc -Wall -O2 parent.c -o parent
gcc -Wall -O2 child.c -o child
and run them using e.g.
./parent 100 ./child
where the 100
is the number of child processes to fork, each running ./child
.
Errors are output to standard error. Each line from parent to standard output begins with Parent:
, and each line from any child to standard output begins with Child:
.
On my machine, the last line in the output is always Parent: All # child processes were reaped successfully.
, which means that every child process fork()
ed, was reaped and reported using wait()
. Nothing was lost, and there were no issues with fork()
and kill()
.
(Do note that if you specify more child processes than you are allowed to fork, the parent program does not consider that an error, and just uses the allowed number of child processes for the test.)
On my machine, forking and reaping 100 child processes is enough work for the parent process, so that every child process gets to the part where it is ready to catch the signal.
On the other hand, the parent can handle 10 child processes (running ./parent 10 ./child
) so fast that every one of the child processes gets killed by the INT signal before they are ready to handle the signal.
Here is the output from a pretty typical case when running ./parent 20 ./child
:
Child: Child process 19982 started!
Child: Child process 19983 started!
Child: Child process 19984 started!
Child: Child process 19982 terminated by signal INT sent by the parent process 19981.
Child: Child process 19992 started!
Child: Child process 19983 terminated by signal INT sent by the parent process 19981.
Child: Child process 19984 terminated by signal INT sent by the parent process 19981.
Parent: Reaped child process 19982: EXIT_SUCCESS.
Parent: Reaped child process 19985: Terminated by INT.
Parent: Reaped child process 19986: Terminated by INT.
Parent: Reaped child process 19984: EXIT_SUCCESS.
Parent: Reaped child process 19987: Terminated by INT.
Parent: Reaped child process 19988: Terminated by INT.
Parent: Reaped child process 19989: Terminated by INT.
Parent: Reaped child process 19990: Terminated by INT.
Parent: Reaped child process 19991: Terminated by INT.
Parent: Reaped child process 19992: Terminated by INT.
Parent: Reaped child process 19993: Terminated by INT.
Parent: Reaped child process 19994: Terminated by INT.
Parent: Reaped child process 19995: Terminated by INT.
Parent: Reaped child process 19996: Terminated by INT.
Parent: Reaped child process 19983: EXIT_SUCCESS.
Parent: Reaped child process 19997: Terminated by INT.
Parent: Reaped child process 19998: Terminated by INT.
Parent: Reaped child process 19999: Terminated by INT.
Parent: Reaped child process 20000: Terminated by INT.
Parent: Reaped child process 20001: Terminated by INT.
Parent: All 20 child processes were reaped successfully.
Of the 20 child processes, 16 were killed by INT signal before they executed the first printf()
(or fflush(stdout)
) line. (We could add a printf("Child: Child process %ld executing %s\n", (long)getpid(), argv[2]); fflush(stdout);
to parent.c just before the execvp()
line, to see if any of the child processes get killed before they execute at all.)
Of the four remaining child processes (19982, 19983, 19984, and 19992), one (19992) was terminated after the first printf()
or fflush()
, but before it managed to run sigprocmask()
, which blocks the signal and prepares the child for catching it.
Only the three remaining child processes (19982, 19983, and 19984) caught the INT signal sent by the parent process.
As you can see, just adding complete error checking, and adding sufficient output (and fflush(stdout);
where useful, as standard output is buffered by default), lets you run several test cases, and construct a much better overall picture of what is happening.
The program I'm working on is a task control and when I run a command like: restart my_program; restart my_program I get an unstable behaviour. When I call restart, a SIGINT is sent, then a new fork() is called then another SIGINT is sent, just like the example above.
In that case, you are sending the signal before the new fork is ready, so the default disposition of the signal (Termination, for INT) defines what happens.
The solutions to this underlying problem vary. Note that it is at the core of many init system issues. It is easy to solve if the child (my_program
here) co-operates, but difficult in all other cases.
One simple co-operation method is to have the child send a signal to its parent process, whenever it is ready for action. To avoid killing parent processes that are unprepared for such information, a signal that is ignored by default (SIGWINCH
, for example) can be used.
The option of sleeping for some duration, so that the new child process has enough time to become ready for action, is a common, but pretty unreliable method of mitigating this issue. (In particular, the required duration depends on the child process priority, and the overall load on the machine.)
Upvotes: 1
Reputation: 3418
Try calling the waitpid() function inside the for loop. This way, the next child will only write once the previous child is done.
Upvotes: 1