Reputation: 133
I cannot write my code without using goto statements. Are they inevitable?
I wrote a code like this:
sequence is a function which prints all sequences that consists of m numbers from 0 to (n - 1) and returns the number of them.
It works as I expected, but I used three labels and three goto statements in it.
I also wrote sequence_substitute using no goto statements, but it is slower than sequence when n > 9.
//this prints all sequences that consist of m numbers from 0 to (n - 1), and return the number of them.
//for example, if m = 2 and n = 3 then it prints "0 0 / 0 1 / 0 2 / 1 0 / 1 1 / 1 2 / 2 0 / 2 1 / 2 2 / " and return 9.
// Prints every sequence of m numbers, each in 0..n-1, and returns how many
// sequences were printed.  i[] holds the current sequence and j is the
// position currently being filled; the labels express a depth-first search:
//   A: all m positions filled -> print and backtrack (C); else start digit 0.
//   B: digit at position j still in range -> descend to position j + 1.
//   C: backtrack one position, advance its digit, and retest at B.
// NOTE(review): i[] holds at most 100 digits, so callers must keep m <= 100.
int sequence(int m, int n){
int i[100], j = 0, count = 0;
A:
if(!(j < m)){
// A complete sequence is in i[0..m-1]: print it, count it, then backtrack.
for(int k = 0; k < m; ++k) printf("%d ", i[k]);
printf("/ ");
++count;
goto C;
}
i[j] = 0;
B:
if(i[j] < n){
++j;
goto A;
}
C:
--j;
if(j >= 0){
++i[j];
goto B;
}
putchar('\n');
return count;
}
// Goto-free counterpart of sequence(): prints every sequence of m numbers,
// each in 0..n-1, and returns the number printed.  Works like an odometer:
// print the current reading, then increment the rightmost digit, carrying
// leftwards; when the leftmost digit would carry out, we are done.
// NOTE(review): digits[] holds at most 100 positions, so m must be <= 100.
int sequence_substitute(int m, int n){
    int digits[100];
    int total = 0;
    for (int k = 0; k < m; ++k)
        digits[k] = 0;
    for (;;) {
        int pos = m - 1;
        /* Emit the current sequence. */
        for (int k = 0; k < m; ++k)
            printf("%d ", digits[k]);
        printf("/ ");
        ++total;
        /* Advance to the next sequence, propagating carries right-to-left. */
        while (digits[pos] >= n - 1) {
            if (pos == 0) {
                /* Leftmost digit would overflow: all sequences were printed. */
                putchar('\n');
                return total;
            }
            digits[pos] = 0;
            --pos;
        }
        ++digits[pos];
    }
}
Are there any ways to write a function as fast as sequence using no goto statements?
Upvotes: 1
Views: 238
Reputation: 37232
First, calculate the number of results. This will be numbers_of_results = pow(n, m)
. E.g. if m = 2
and n = 3
then there will be pow(3, 2) == 9
results.
Once you know how many results there are you can use this for indexing the solution space. For example:
numbers_of_results = pow(n, m);
for(index = 0; index < numbers_of_results; index++) {
}
The index is just a "base n" number, where each digit in the index determines a number. To extract individual digits from the index you can use digit = (index / pow(n, digit_number) ) % n
, but that pow()
can be avoided when you're extracting all digits one at a time.
For example:
/*
 * Prints all sequences of m numbers drawn from 0..n-1 by treating each
 * sequence index as an m-digit base-n number, and returns the number of
 * sequences (n^m).
 *
 * Fixes versus the original answer text:
 *  - `digit` was used without being declared;
 *  - the digit loop ran n times and divided/took remainders by m: a
 *    sequence has m digits in base n, so the roles of m and n were swapped;
 *  - the function's closing brace was missing;
 *  - pow() returns double and can round for larger arguments, so the
 *    count is computed with integer multiplication instead.
 *
 * Note: digits are extracted least-significant first, so the output order
 * differs from the question's sequence() function (as in the original answer).
 */
unsigned int sequence(unsigned int m, unsigned int n) {
    unsigned int numbers_of_results, index, temp, digit_number, digit;

    /* numbers_of_results = n^m, in integer arithmetic. */
    numbers_of_results = 1;
    for (digit_number = 0; digit_number < m; digit_number++)
        numbers_of_results *= n;

    for (index = 0; index < numbers_of_results; index++) {
        temp = index;
        for (digit_number = 0; digit_number < m; digit_number++) {
            digit = temp % n;   /* next base-n digit of the index */
            temp /= n;
            printf("%u ", digit);
        }
        printf("/ ");
    }
    putchar('\n');
    return numbers_of_results;
}
Now; think about the "readability/maintainability vs. performance" compromise and (if you care about performance at all) which CPU/s you care about.
For some CPUs (and not others) it might be better to replace pow()
with your own implementation (designed for integers only). For some CPUs (and not others) it might be beneficial to add special case versions (e.g. for all the cases where m
is a power of 2, and divisions and modulo can be replaced with "shift by constant m" and "AND with constant m-1"). For some CPUs (and not others), division and modulo might be a lot more expensive than branch misprediction costs, so it might improve performance to round m
up to the nearest power of 2 (so you can use one of the "shift and AND" special versions) but then suppress/discarded cases where the index contains a digit that's too large.
Upvotes: 1
Reputation:
I Benchmarked these two functions in the following code.
In this benchmark, (m, n) = (6,15);
#include <stdio.h>
#include <stdlib.h>
double get_current_time();
int sequence(int m, int n);
int sequence_substitute(int m, int n);
/* Times ten calls of method(m, n) with get_current_time() and returns the
   average wall-clock seconds per call. */
double benchmark(int (*method)(int, int), int m, int n) {
    const int runs = 10;
    double started = get_current_time();
    for (int run = 0; run < runs; ++run)
        method(m, n);
    double finished = get_current_time();
    return (finished - started) / runs;
}
int main(void) {
// Benchmark both generators at the problem size from the question.
const int m = 6;
const int n = 15;
// Timings go to stderr so the generators' stdout output does not
// interleave with the report.
fprintf(stderr, "sequence: %f\n", benchmark(sequence, m, n));
fprintf(stderr, "sequence_substitute: %f\n",
benchmark(sequence_substitute, m, n));
return 0;
}
#if defined(WIN32) || defined(__WIN32) || defined(_WIN32) || \
defined(__WIN32__) || defined(_WIN32_)
#include <windows.h>
double get_current_time() {
LARGE_INTENGER t, f;
QueryPerformanceCounter(&t);
QueryPerformanceFrequency(&f);
return (double)t.QuadPart / (double)f.QuadPart;
}
#else
#include <sys/resource.h>
#include <sys/time.h>
double get_current_time() {
struct timeval t;
gettimeofday(&t, 0);
return t.tv_sec + t.tv_usec * 1e-6;
}
#endif
/**************************************************************************/
// this prints all sequences that consist of m numbers from 0 to (n - 1), and
// return the number of them. for example, if m = 2 and n = 3 then it prints "0
// 0 / 0 1 / 0 2 / 1 0 / 1 1 / 1 2 / 2 0 / 2 1 / 2 2 / " and return 9.
// (Verbatim copy of the question's goto-based generator, used in the
// benchmark: j is the position being filled, i[] the current sequence.)
int sequence(int m, int n) {
int i[100], j = 0, count = 0;
A:
// All m positions filled: print the sequence, count it, then backtrack.
if (!(j < m)) {
for (int k = 0; k < m; ++k) printf("%d ", i[k]);
printf("/ ");
++count;
goto C;
}
i[j] = 0;
B:
// Digit at position j still in range: descend to fill position j + 1.
if (i[j] < n) {
++j;
goto A;
}
C:
// Backtrack one position and advance its digit, retesting at B.
--j;
if (j >= 0) {
++i[j];
goto B;
}
putchar('\n');
return count;
}
// Goto-free generator: prints every sequence of m numbers in 0..n-1 and
// returns the count.  Works like an odometer: print, then increment the
// rightmost digit, carrying leftwards; finish when the leftmost digit
// would carry out.
int sequence_substitute(int m, int n) {
int i[100], count = 0;
for (int j = 0; j < m; ++j) i[j] = 0;
for (;;) {
int j = m - 1;
// Emit the current sequence, then advance to the next one.
for (int k = 0; k < m; ++k) printf("%d ", i[k]);
printf("/ ");
++count;
for (;;) {
if (i[j] < n - 1) {
++i[j];
break;
} else {
if (j == 0) {
// Leftmost digit would overflow: all sequences printed.
putchar('\n');
return count;
} else {
// Carry: reset this digit and move one position left.
i[j] = 0;
--j;
}
}
}
}
}
https://gist.github.com/yuchiki/26fd96a2791f7f6d2d9929b404a16da6
and the result is as follows:
when compiled with -O3,
and when compiled with -O0,
The results show that these two functions calculate the result in almost the same speed, even without any optimization.
Maybe the code you show here is too vague to reproduce the difference in speed you reported.
To discuss the phenomenon more precisely, it may be useful to show us the following information:
I tried another benchmark on the IO-free version of these two functions, by the following test code:
#include <stdio.h>
#include <stdlib.h>
double get_current_time();
int sequence(int m, int n);
int sequence_substitute(int m, int n);
// Calls method(m, n) ten times and returns the average wall-clock seconds
// per call, using get_current_time() for the timestamps.
double benchmark(int (*method)(int, int), int m, int n) {
double time1 = get_current_time();
method(m, n);
method(m, n);
method(m, n);
method(m, n);
method(m, n);
method(m, n);
method(m, n);
method(m, n);
method(m, n);
method(m, n);
double time2 = get_current_time();
return (time2 - time1) / 10;
}
int main(void) {
// The IO-free benchmark uses a slightly larger problem size (m = 7).
const int m = 7;
const int n = 15;
fprintf(stderr, "sequence: %f\n", benchmark(sequence, m, n));
fprintf(stderr, "sequence_substitute: %f\n",
benchmark(sequence_substitute, m, n));
return 0;
}
#if defined(WIN32) || defined(__WIN32) || defined(_WIN32) || \
defined(__WIN32__) || defined(_WIN32_)
#include <windows.h>
double get_current_time() {
LARGE_INTENGER t, f;
QueryPerformanceCounter(&t);
QueryPerformanceFrequency(&f);
return (double)t.QuadPart / (double)f.QuadPart;
}
#else
#include <sys/resource.h>
#include <sys/time.h>
// Returns the current wall-clock time in seconds (microsecond resolution)
// via POSIX gettimeofday().
double get_current_time() {
struct timeval t;
gettimeofday(&t, 0);
return t.tv_sec + t.tv_usec * 1e-6;
}
#endif
/**************************************************************************/
// this prints all sequences that consist of m numbers from 0 to (n - 1), and
// return the number of them. for example, if m = 2 and n = 3 then it prints "0
// 0 / 0 1 / 0 2 / 1 0 / 1 1 / 1 2 / 2 0 / 2 1 / 2 2 / " and return 9.
// IO-free variant of the goto-based generator: counts every sequence of m
// numbers in 0..n-1 without printing (the printf calls of the original are
// omitted).  Returns n^m.  digits[] holds the current sequence and depth is
// the position being filled.
int sequence(int m, int n) {
  int digits[100];
  int depth = 0;
  int total = 0;

place:
  if (depth >= m) {
    ++total;  // a full sequence of m digits was formed
    goto carry;
  }
  digits[depth] = 0;

check:
  if (digits[depth] < n) {
    ++depth;
    goto place;
  }

carry:
  if (--depth < 0)
    return total;
  ++digits[depth];
  goto check;
}
// IO-free variant of the goto-free generator: counts every sequence of m
// numbers in 0..n-1 (printing removed).  Odometer scheme: count the current
// reading, then increment the rightmost digit, carrying leftwards; when the
// leftmost digit would carry out, return the total (n^m).
int sequence_substitute(int m, int n) {
  int digits[100];
  int total = 0;
  for (int k = 0; k < m; ++k)
    digits[k] = 0;
  for (;;) {
    int pos = m - 1;
    ++total;
    while (digits[pos] >= n - 1) {
      if (pos == 0)
        return total;  // leftmost digit would overflow: done
      digits[pos] = 0;
      --pos;
    }
    ++digits[pos];
  }
}
the result is as follows:
when compiled with -O3,
when compiled with -O0,
sequence: 0.136406[sec] sequence_substitute: 0.112287[sec]
I do not think the result of the -O3 version carries much meaning, because most of the code was probably deleted by the optimizer in this case. But the -O0 version suggests the following facts:
Upvotes: 3
Reputation: 753695
Rather like yuchiki in their answer, I benchmarked the code. I also came up with my own solution to the problem.
I ran my tests on my MacBook Pro (15 inch, 2017) with 2.9 GHz Intel Core i7, running macOS 10.14.2 Mojave, and using a home-built GCC 8.2.0, plus the timing code available in my SOQ (Stack Overflow Questions) repository on GitHub as files timer.c
and timer.h
in the src/libsoq sub-directory. I renamed the sequence
function from the question to sequence_withgoto
to give its name a similar length to the other functions. I removed (by virtue of commenting out) the printing code in the sequence generator functions. I changed the counter type from int
to unsigned
(though one can argue that it could/should be unsigned long long
to give a bigger range). The maximal test shown below just precisely overflows a 32-bit unsigned
type, giving the answer 0.
Comment-free test code (source file seq23.c
):
#include <assert.h>
#include <stdio.h>
#include "timer.h"
// Counts every sequence of m numbers, each in 0..n-1, using the question's
// label-based depth-first search (printing removed).  Returns n^m as an
// unsigned count.  digits[] is the current sequence; depth is the position
// being filled.
static unsigned sequence_withgoto(int m, int n)
{
    int digits[100];
    int depth = 0;
    unsigned total = 0;

place:
    if (depth >= m)
    {
        ++total;    /* a complete m-digit sequence was formed */
        goto carry;
    }
    digits[depth] = 0;

check:
    if (digits[depth] < n)
    {
        ++depth;
        goto place;
    }

carry:
    --depth;
    if (depth >= 0)
    {
        ++digits[depth];
        goto check;
    }
    return total;
}
// Counts every sequence of m numbers, each in 0..n-1, without goto
// (printing removed).  Odometer scheme: count the current reading, then
// increment the rightmost digit, carrying leftwards; when the leftmost
// digit would carry out, return the total (n^m).
static unsigned sequence_substitute(int m, int n)
{
    int digits[100];
    unsigned total = 0;

    for (int k = 0; k < m; ++k)
        digits[k] = 0;

    for (;;)
    {
        int pos = m - 1;
        ++total;
        while (digits[pos] >= n - 1)
        {
            if (pos == 0)
                return total;   /* leftmost digit would overflow: done */
            digits[pos] = 0;
            --pos;
        }
        ++digits[pos];
    }
}
// Counts every sequence of m numbers, each in 0..n-1, using a VLA sized
// exactly m.  The rightmost entry varies fastest; on overflow it resets to
// 0 and the carry moves left.  The loop ends when digit[0] reaches n
// (wrapping of the leftmost digit is suppressed).  Returns n^m.
// Preconditions (unchanged from the original): 0 < m <= n < 20.
static unsigned generate_sequence(int m, int n)
{
    assert(m <= n);
    assert(m > 0);
    assert(n < 20);
    int digit[m];
    for (int k = 0; k < m; k++)
        digit[k] = 0;
    unsigned total = 0;
    while (digit[0] < n)
    {
        total++;
        /* Increment with carry, right to left; leave digit[0] unreset so
           the outer while can detect completion. */
        int k = m - 1;
        while (++digit[k] >= n && k > 0)
        {
            digit[k] = 0;
            --k;
        }
    }
    return total;
}
// Times one call of the given sequence-counting function and prints m, n,
// the returned count, and the elapsed time together with the identifying
// tag.  Clock/clk_* and clk_elapsed_us come from timer.h (SOQ library).
static void time_sequence_generator(const char *tag, int m, int n, unsigned (*function)(int m, int n))
{
Clock clk;
clk_init(&clk);
clk_start(&clk);
unsigned count = (*function)(m, n);
clk_stop(&clk);
char buffer[32];
printf("Number of sequences (m = %d, n = %d): %u elapsed = %s (%s)\n",
m, n, count, clk_elapsed_us(&clk, buffer, sizeof(buffer)), tag);
}
// Runs all three generator implementations on the same (m, n) so their
// timings can be compared directly.
static void test_sequence_generators(int m, int n)
{
time_sequence_generator("generate_sequence", m, n, generate_sequence);
time_sequence_generator("sequence_withgoto", m, n, sequence_withgoto);
time_sequence_generator("sequence_substitute", m, n, sequence_substitute);
}
int main(void)
{
// Problem sizes in roughly increasing order of work.  The last case
// (m = 8, n = 16) produces 16^8 = 2^32 sequences, which overflows the
// 32-bit unsigned counter to 0 (deliberately, as discussed in the text).
test_sequence_generators(2, 3);
test_sequence_generators(5, 9);
test_sequence_generators(4, 10);
test_sequence_generators(6, 15);
test_sequence_generators(7, 19);
test_sequence_generators(8, 16);
return 0;
}
Compilation command line:
gcc -O3 -g -I./inc -std=c11 -Wall -Wextra -Werror seq23.c -o seq23 -L./lib -lsoq
The headers are installed in ./inc
and the library containing the timer code is in ./lib
(in the static library libsoq.a
).
The results I got are striking and consistent over multiple runs:
Number of sequences (m = 2, n = 3): 9 elapsed = 0.000005 (generate_sequence)
Number of sequences (m = 2, n = 3): 9 elapsed = 0.000000 (sequence_withgoto)
Number of sequences (m = 2, n = 3): 9 elapsed = 0.000000 (sequence_substitute)
Number of sequences (m = 5, n = 9): 59049 elapsed = 0.000098 (generate_sequence)
Number of sequences (m = 5, n = 9): 59049 elapsed = 0.000119 (sequence_withgoto)
Number of sequences (m = 5, n = 9): 59049 elapsed = 0.000068 (sequence_substitute)
Number of sequences (m = 4, n = 10): 10000 elapsed = 0.000012 (generate_sequence)
Number of sequences (m = 4, n = 10): 10000 elapsed = 0.000015 (sequence_withgoto)
Number of sequences (m = 4, n = 10): 10000 elapsed = 0.000010 (sequence_substitute)
Number of sequences (m = 6, n = 15): 11390625 elapsed = 0.013260 (generate_sequence)
Number of sequences (m = 6, n = 15): 11390625 elapsed = 0.015959 (sequence_withgoto)
Number of sequences (m = 6, n = 15): 11390625 elapsed = 0.010123 (sequence_substitute)
Number of sequences (m = 7, n = 19): 893871739 elapsed = 1.064473 (generate_sequence)
Number of sequences (m = 7, n = 19): 893871739 elapsed = 1.206680 (sequence_withgoto)
Number of sequences (m = 7, n = 19): 893871739 elapsed = 0.758287 (sequence_substitute)
Number of sequences (m = 8, n = 16): 0 elapsed = 4.819932 (generate_sequence)
Number of sequences (m = 8, n = 16): 0 elapsed = 5.712081 (sequence_withgoto)
Number of sequences (m = 8, n = 16): 0 elapsed = 3.705033 (sequence_substitute)
The (m = 8, n = 16)
sequence generates 16^8 = 2^32 sequences, which means that the unsigned
counter overflows to 0
.
What's striking to me is that the sequence_withgoto()
is the slowest of the functions; my generate_sequence()
comes in the middle at all but the smallest problem sizes; and sequence_substitute()
from the question is fastest, by a noticeable margin (about 2/3 the time of the variant using goto
on the last sequence).
The algorithm I implemented is described in the commented file with:
/*
** Algorithm:
** Array data contains m values.
** The farthest right entry varies continuously; when it exceeds n-1, it
** is reset to 0. If the increment wraps back to 0, then the previous
** index is incremented (and if it wraps to zero, ...). The loop
** finishes when the zeroth index reaches n (wrapping is suppressed).
*/
To answer the headline question in the title:
goto
statements are not mandatory.To answer the speed question in the body:
goto
is not faster, in general, on large-size problems (and if there is any performance advantage on small problems, it is unlikely to be sufficient to be important).Upvotes: 0
Reputation: 39326
What you are focusing on, has no relevance in the real world. No, I am not chiding you; I am only pointing out that you are worrying about something that does not matter.
The most efficient implementation is the one that does no work.
In practice, the maintainability of code is way more important than absolute maximum performance. We learn more about computer science every day, and the hardware we use changes, so what is absolutely optimal today won't be that tomorrow.
Having it be good enough, and easily modified/updated, is worth much more.
When one generates a sequence, the speed is not that important, because you only run it once, and save the results.
When you work with a sequence, you often cannot store and retrieve them (because either there is not enough matter in the universe, or I/O is too slow to be practical). Instead, you generate the explicit value in the sequence instead. This answer of mine, to a related question -- one where the values in the sequence have unique digits --, shows how you can do that. Essentially, you use the unsigned integer "index", starting at 0, as the seed to generate the value in the sequence.
Sometimes, however, you are just playing with code, or perhaps making a game of it (like code golf here at stack overflow). Or, perhaps you have multiple different implementations that do the exact same thing, and you wish to compare them fairly.
Then, the problem is the difference between microbenchmarking and benchmarking.
Our current computers are complex enough so that when considering concepts like speed or efficiency, completely different operations heavily affect each other. The most common mechanism is via the cache. Simply put, if your super-fast function uses a lot of cache, it can slow down other operations (because the data they need is no longer cached, and must be loaded from system RAM instead), so that the task the function is part of, is slowed down overall because of the function!
This means that to do proper benchmarks, proper analysis of the performance of programs, we need to use real-world payloads. Several different ones, in fact. And just try to form a complete picture of how each implementation performs; when they are efficient, when they are slow and resource-hungry, and so on.
A common problem is scalability. Some functions are faster than others when there is lots of data, but when the data set is small, the inverse may be true. This is especially common with sort functions. Some sort functions are very slow if there is not much data; but faster than any other function when you have enough data. (Radix sort for numerical data is a perfect example: it does sort in essentially linear time, but the linear factor is so large that you need arrays with millions of entries before radix sorting takes less time than other sort algorithms; radix sort also tends to be more cache-heavy than other algorithms. So, even though it is "superior" in certain terms, it is not often used.)
We use the term microbenchmark when we compare e.g. functions with synthetic test cases, in an effort to remind us humans that such tests are indicative, and not perfectly reliable, for the abovementioned reasons.
Microbenchmarking itself is an art to do correctly.
In particular, just averaging the time taken over a large number of runs, does not give a full picture. Unrelated events elsewhere in the system sometimes slow down individual runs (of the microbenchmarked function); in the worst cases, you perceive these as "jitters" with video playback, mouse movement, or animation. Because of such events and that other sources of measurement error have similar characteristics, the measured timings have asymmetric error bars: the measured duration is "never" too low, but it is often too high, because the timing involved those extraneous events.
Cache effects mean that if you call the same function over data stored in the same buffer, the first call will be slower than subsequent calls, as its timing will include the time taken to get the cache "hot"; to load the contents from RAM.
Because of those reasons, I recommend using the median, or some other percentile, instead of mean.
If you think about it, you are rarely interested in how long say 1000 consecutive calls might take, because you don't do those calls consecutively in real programs. However, if you know that the median time each function call takes is T, you know that in at least 50% of the microbenchmarked cases, the function call took T or less time.
There is also the issue of which clock to use on a computer. We can measure the wall clock time (clock_gettime(CLOCK_MONOTONIC, ×pec)
in POSIXy systems), or only the CPU time (clock_gettime(CLOCK_THREAD_CPUTIME_ID, ×pec)
) taken by that specific thread. The wall clock time is affected by everything else running on the same machine, but better corresponds to what a human can expect as to performance; the CPU time is better for mathematical calculations that do not involve significant amounts of data. (I'd say CPU time is better for functions like OP is wondering about.)
A final wrinkle in the matter is compiler optimizations. Compilers nowadays can do surprising optimizations at the machine code level, depending on the context in which the function is called. For real benchmarking, that is not an issue, but for microbenchmarking, we really need to isolate the functions into separate units, so that we can compare them.
(Of course, all results are specific to each machine, and vary depending on the compiler, compiler options, and libraries used. Just because microbenchmark X is faster on machine A than on machine B, does not mean that microbenchmark Y is also faster on A than on B. Remember: indicative, not proof or really reliable.)
Now that I have bored you to death with this wall of text, we can look at implementing a real-world microbenchmark for comparing such sequence generators fairly.
First, the I/O part will be the bottleneck. We throw that out. Instead, we use a local buffer to hold the string, and whenever a new value is completed, we call a callback function. This way the compiler cannot do any real bad shenanigans, like avoid calculating the value altogether, because it must assume the value is used.
/*
 * Generates every sequence of m digits, each in 0..n-1, invoking
 * callback(index, seq) once per sequence.  seq is a NUL-terminated string
 * of m digit characters (most significant first); index is the 0-based
 * ordinal of the sequence.
 *
 * Fix versus the original: buffer was declared and NUL-terminated but
 * never filled in, so every callback received an indeterminate string.
 * The digits are now rendered into the buffer before each call; digits
 * 0..9 map to '0'..'9' and larger digits to 'a', 'b', ... (assumes
 * n <= 36 -- confirm with callers).
 */
void sequence(int m, int n, void (*callback)(int index, char seq[]))
{
    int index[m + 1];
    char buffer[m + 1];
    int count = 0;
    int j = 0;

    buffer[m] = '\0';

    A:
    if (j >= m) {
        /* Render the current digits into the buffer for the callback. */
        for (int k = 0; k < m; k++)
            buffer[k] = (index[k] < 10) ? ('0' + index[k])
                                        : ('a' + index[k] - 10);
        callback(count, buffer);
        ++count;
        goto C;
    }
    index[j] = 0;

    B:
    if (index[j] < n) {
        ++j;
        goto A;
    }

    C:
    --j;
    if (j >= 0) {
        ++index[j];
        goto B;
    }
    return;
}
I also renamed the i
array to index
. Descriptive names work better.
Let's say we put that into fiveseven.c. Let's also write a small Makefile to help us compile this stuff:
# Compiler and default flags; override at build time, e.g. `make CC=clang`.
CC := gcc
CFLAGS := -Wall -O2
LDFLAGS :=
# Each implementation under test is built into its own shared library.
LIBS := fiveseven.so
.PHONY: all clean
all: clean benchmark $(LIBS)
clean:
rm -f benchmark $(LIBS)
# Pattern rule: build each candidate .c into a dynamically loadable .so.
%.so: %.c
$(CC) $(CFLAGS) -fPIC -shared $^ -Wl,-soname,$@ $(LDFLAGS) -o $@
# The benchmark driver links with -ldl so it can dlopen() the libraries.
benchmark: benchmark.c
$(CC) $(CFLAGS) $^ $(LDFLAGS) -ldl -o $@
Note that the indentation uses Tabs, not spaces; but this forum does not allow those in pasted code. So, if you copy-paste the above, run sed -e 's|^ *|\t|' -i Makefile
to fix the indentation.
This compiles the actual microbenchmarking program, from benchmark.c to ./benchmark, separately; and the function or functions to be compared, into dynamically loaded libraries. This avoids the compiler doing any surprising optimizations to the code.
If you add other implementations, just add their library name (ending with .so
on Linux) to the LIBS
line. Note that you can also run make CC=clang
or make LIBS=foo.so clean all
and so on, overriding the variables at build time.
Of course, we need the benchmarking program itself. Here is one implementation, benchmark.c:
#define _POSIX_C_SOURCE 200809L
// SPDX-License-Identifier: CC0-1.0
#include <stdlib.h>
#include <inttypes.h>
#include <dlfcn.h>
#include <stdio.h>
#include <time.h>
typedef void (sequence_fn)(int, int, void (*)(int, char []));
/*
 * qsort() comparator for arrays of int64_t, ordering ascending.
 *
 * Fix: the original dereferenced the element pointers as
 * `const uint64_t *` even though the array elements are int64_t;
 * converting the resulting unsigned value back to int64_t is
 * implementation-defined for values above INT64_MAX (i.e. for negative
 * elements).  Reading through `const int64_t *` is exact and portable.
 */
static int compare_int64(const void *ptr1, const void *ptr2)
{
    const int64_t val1 = *(const int64_t *)ptr1;
    const int64_t val2 = *(const int64_t *)ptr2;

    return (val1 < val2) ? -1 :
           (val1 > val2) ? +1 : 0;
}
/* Do-nothing callback: lets the benchmark time the generator itself while
   still forcing the compiler to assume every generated value is consumed.
   The (void) casts silence -Wunused-parameter warnings. */
static void nothing(int index, char value[])
{
    (void)index;
    (void)value;
    return;
}
// Runs sequence_function(length, radix, nothing) `iterations` times,
// timing each call with the per-thread CPU clock, and returns the median
// time per call in seconds.  Returns -1.0 on invalid parameters or if the
// sample buffer cannot be allocated.  The median (not the mean) is used
// because extraneous system events can only inflate individual samples.
static double median_cpu_time(const size_t iterations,
const int length,
const int radix,
sequence_fn *sequence_function)
{
struct timespec started, stopped;
int64_t *nanosecs;
double result;
size_t i;
if (iterations < 1 || length < 1 || radix < 1 || !sequence_function)
return -1.0; /* Invalid parameters. */
nanosecs = malloc(iterations * sizeof nanosecs[0]);
if (!nanosecs)
return -1.0; /* Out of memory. */
for (i = 0; i < iterations; i++) {
// CLOCK_THREAD_CPUTIME_ID counts only CPU time used by this thread,
// so unrelated system activity does not directly distort the sample.
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &started);
sequence_function(length, radix, nothing);
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &stopped);
nanosecs[i] = (int64_t)(stopped.tv_sec - started.tv_sec) * INT64_C(1000000000)
+ (int64_t)(stopped.tv_nsec - started.tv_nsec);
}
// Sort the samples and take the middle one as the median.
qsort(nanosecs, iterations, sizeof (int64_t), compare_int64);
result = (double)nanosecs[iterations / 2] / 1000000000.0;
free(nanosecs);
return result;
}
// Command-line driver: for each dynamic library named on the command line,
// loads it with dlopen(), looks up its sequence() implementation, and
// reports the median CPU time per call over the requested iterations.
int main(int argc, char *argv[])
{
size_t iterations;
int arg, length, radix;
void *handle;
sequence_fn *func;
double seconds;
char dummy;
if (argc < 5) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: %s [ -h | --help | help ]\n", argv[0]);
fprintf(stderr, " %s LENGTH RADIX ITERATIONS ./library.so ...\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "This program measures the median CPU time taken by\n");
fprintf(stderr, "each sequence(LENGTH, RADIX, callback) call\n");
fprintf(stderr, "over ITERATIONS iterations, as implemented in each\n");
fprintf(stderr, "listed dynamic library.\n");
fprintf(stderr, "\n");
return EXIT_FAILURE;
}
// Parse and validate the numeric arguments; the trailing " %c" probe
// makes sscanf reject any junk after the number.
if (sscanf(argv[1], " %d %c", &length, &dummy) != 1 || length < 1) {
fprintf(stderr, "%s: Invalid length.\n", argv[1]);
return EXIT_FAILURE;
}
if (sscanf(argv[2], " %d %c", &radix, &dummy) != 1 || radix < 1) {
fprintf(stderr, "%s: Invalid radix.\n", argv[2]);
return EXIT_FAILURE;
}
if (sscanf(argv[3], " %zu %c", &iterations, &dummy) != 1 || iterations < 1) {
fprintf(stderr, "%s: Invalid number of iterations.\n", argv[3]);
return EXIT_FAILURE;
}
printf("Reporting median CPU time used over %zu iterations.\n", iterations);
printf("Length = %d, radix = %d.\n", length, radix);
fflush(stdout);
// Each remaining argument names a shared library expected to export a
// sequence() function with the sequence_fn signature.
for (arg = 4; arg < argc; arg++) {
handle = dlopen(argv[arg], RTLD_NOW);
if (!handle) {
fprintf(stderr, "%s: %s.\n", argv[arg], dlerror());
continue;
}
func = dlsym(handle, "sequence");
if (!func) {
fprintf(stderr, "%s: Library does not implement sequence().\n", argv[arg]);
dlclose(handle);
continue;
}
printf(" %s: ", argv[arg]);
fflush(stdout);
seconds = median_cpu_time(iterations, length, radix, func);
printf("%.9f seconds median per call\n", seconds);
fflush(stdout);
dlclose(handle);
}
return EXIT_SUCCESS;
}
Note that it provides a callback function that does nothing, and that most of the program is really just providing command-line usage information. That is surprisingly useful in practice. I tend to put each "project" into a separate directory/folder. When I'm looking for a specific thing, I do a find + grep or a grep -e pattern -R base-of-directories
, then examine likely candidates. If there is an executable I can run to see the usage (mine always have one!), it lets me see the purpose of that particular directory; that is much faster and less tedious than reading thousands of lines of code to try to recall whether that is what I was looking for.
After you save the above, run e.g.
make clean all
./benchmark 5 4 100000 ./fiveseven.so
to see how much CPU time, in seconds, median, each call to OP's sequence()
function takes, including the overhead of calling a do-nothing function via a function pointer for every value generated.
Do note that this microbenchmark does nothing to verify the correctness of the results. It is something that should be considered as well.
Upvotes: 0