David542

Reputation: 110163

Usage of getc with a file

To print the contents of a file one can use getc:

#include <stdio.h>

int main(void) {
    int ch;
    FILE *file = fopen("file.txt", "r");
    while ((ch = getc(file)) != EOF) {
        // do something
    }
    fclose(file);
}

How efficient is the getc function? That is, how frequently does it actually do operating system calls or something that would take a non-trivial amount of time? For example, let's say I had a 10TB file -- would calling this function trillions of times be a poor way to get through data?

Upvotes: 2

Views: 785

Answers (2)

qwr

Reputation: 10919

To follow up on Basile Starynkevitch's excellent answer, let's go down the glibc rabbit hole! (All code below is from the glibc sources as of the time of writing.)

libio/getc.c

#undef _IO_getc

int
_IO_getc (FILE *fp)
{
  int result;
  CHECK_FILE (fp, EOF);
  if (!_IO_need_lock (fp))
    return _IO_getc_unlocked (fp);
  _IO_acquire_lock (fp);
  result = _IO_getc_unlocked (fp);
  _IO_release_lock (fp);
  return result;
}

#undef getc

weak_alias (_IO_getc, getc)
weak_alias (_IO_getc, fgetc)

This acquires a lock if necessary, so we can safely do file I/O without worrying about other threads.
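
As an aside: when you know only a single thread touches the stream, POSIX also provides getc_unlocked(3), which skips that locking. A minimal sketch (the file name is just an example):

#include <stdio.h>

int main(void)
{
    FILE *file = fopen("file.txt", "r");
    if (file == NULL)
        return 1;

    int ch;
    /* Safe only while no other thread uses this stream. */
    while ((ch = getc_unlocked(file)) != EOF) {
        /* do something with ch */
    }
    fclose(file);
    return 0;
}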

libio/libio.h

#define _IO_getc_unlocked(_fp) __getc_unlocked_body (_fp)

libio/bits/types/struct_FILE.h

/* These macros are used by bits/stdio.h and internal headers.  */
#define __getc_unlocked_body(_fp)                   \
  (__glibc_unlikely ((_fp)->_IO_read_ptr >= (_fp)->_IO_read_end)    \
   ? __uflow (_fp) : *(unsigned char *) (_fp)->_IO_read_ptr++)

So all it really does is return what's at the file read pointer and increment it! But glibc uses an I/O buffer with a default size of 8192 bytes (BUFSIZ). The only case that needs special handling is reading past the end of the read area, in which case the virtual __uflow function is called, which eventually makes its way to an underflow function.
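
To make the idea concrete, here is a toy sketch of a buffered getc (emphatically not glibc's actual code): one read(2) syscall refills the buffer only when it runs dry, so the vast majority of calls are pure pointer arithmetic.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Toy sketch (not glibc's code): refill an 8192-byte buffer with one
   read(2) only when it runs dry; every other call is pointer math. */
enum { BUFSZ = 8192 };
static unsigned char buf[BUFSZ];
static unsigned char *ptr = buf, *end = buf;

static int my_getc(int fd)
{
    if (ptr >= end) {                /* buffer exhausted: one syscall */
        ssize_t n = read(fd, buf, BUFSZ);
        if (n <= 0)
            return -1;               /* EOF or error */
        ptr = buf;
        end = buf + n;
    }
    return *ptr++;                   /* common case: no syscall at all */
}

int main(void)
{
    int fd = open("file.txt", O_RDONLY);
    if (fd < 0)
        return 1;
    long bytes = 0;
    while (my_getc(fd) != -1)
        bytes++;
    printf("%ld bytes\n", bytes);
    close(fd);
    return 0;
}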

libio/genops.c

int
__uflow (FILE *fp)
{
  if (_IO_vtable_offset (fp) == 0 && _IO_fwide (fp, -1) != -1)
    return EOF;

  if (fp->_mode == 0)
    _IO_fwide (fp, -1);
  if (_IO_in_put_mode (fp))
    if (_IO_switch_to_get_mode (fp) == EOF)
      return EOF;
  if (fp->_IO_read_ptr < fp->_IO_read_end)
    return *(unsigned char *) fp->_IO_read_ptr++;
  if (_IO_in_backup (fp))
    {
      _IO_switch_to_main_get_area (fp);
      if (fp->_IO_read_ptr < fp->_IO_read_end)
    return *(unsigned char *) fp->_IO_read_ptr++;
    }
  if (_IO_have_markers (fp))
    {
      if (save_for_backup (fp, fp->_IO_read_end))
    return EOF;
    }
  else if (_IO_have_backup (fp))
    _IO_free_backup_area (fp);
  return _IO_UFLOW (fp);
}
libc_hidden_def (__uflow)

libio/libioP.h

/* THE JUMPTABLE FUNCTIONS.

 * The _IO_FILE type is used to implement the FILE type in GNU libc,
 * as well as the streambuf class in GNU iostreams for C++.
 * These are all the same, just used differently.
 * An _IO_FILE (or FILE) object is allows followed by a pointer to
 * a jump table (of pointers to functions).  The pointer is accessed
 * with the _IO_JUMPS macro.  The jump table has an eccentric format,
 * so as to be compatible with the layout of a C++ virtual function table.
 * (as implemented by g++).  When a pointer to a streambuf object is
 * coerced to an (FILE*), then _IO_JUMPS on the result just
 * happens to point to the virtual function table of the streambuf.
 * Thus the _IO_JUMPS function table used for C stdio/libio does
 * double duty as the virtual function table for C++ streambuf.
 *
 * The entries in the _IO_JUMPS function table (and hence also the
 * virtual functions of a streambuf) are described below.
 * The first parameter of each function entry is the _IO_FILE/streambuf
 * object being acted on (i.e. the 'this' parameter).
 */
/* The 'uflow' hook returns the next character in the input stream
   (cast to unsigned char), and increments the read position;
   EOF is returned on failure.
   It matches the streambuf::uflow virtual function, which is not in the
   cfront implementation, but was added to C++ by the ANSI/ISO committee. */
#define _IO_UFLOW(FP) JUMP0 (__uflow, FP)
struct _IO_jump_t
{
    JUMP_FIELD(size_t, __dummy);
    JUMP_FIELD(size_t, __dummy2);
    JUMP_FIELD(_IO_finish_t, __finish);
    JUMP_FIELD(_IO_overflow_t, __overflow);
    JUMP_FIELD(_IO_underflow_t, __underflow);
    JUMP_FIELD(_IO_underflow_t, __uflow);
    JUMP_FIELD(_IO_pbackfail_t, __pbackfail);
    /* showmany */
    JUMP_FIELD(_IO_xsputn_t, __xsputn);
    JUMP_FIELD(_IO_xsgetn_t, __xsgetn);
    JUMP_FIELD(_IO_seekoff_t, __seekoff);
    JUMP_FIELD(_IO_seekpos_t, __seekpos);
    JUMP_FIELD(_IO_setbuf_t, __setbuf);
    JUMP_FIELD(_IO_sync_t, __sync);
    JUMP_FIELD(_IO_doallocate_t, __doallocate);
    JUMP_FIELD(_IO_read_t, __read);
    JUMP_FIELD(_IO_write_t, __write);
    JUMP_FIELD(_IO_seek_t, __seek);
    JUMP_FIELD(_IO_close_t, __close);
    JUMP_FIELD(_IO_stat_t, __stat);
    JUMP_FIELD(_IO_showmanyc_t, __showmanyc);
    JUMP_FIELD(_IO_imbue_t, __imbue);
};

Disclaimer: I don't know exactly how this vtable works. It seems to match C++'s std::streambuf. I believe the virtual uflow function by default calls the underflow functions, which actually handle the buffers/mmap; see glibc: When and where is the stdio stream buffer allocated and initialized? (Looking through the commit history, it looks like Ulrich Drepper and Roland McGrath wrote most of this in the late 90s / early 2000s. That original C code is still running!)
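
The jump-table idea itself is just ordinary C, though: a struct of function pointers dispatched through a macro. A hypothetical miniature (none of these names are glibc's):

#include <stdio.h>

/* Hypothetical miniature of vtable-style dispatch in plain C;
   none of these names are glibc's. */
struct stream;

struct stream_ops {
    int (*uflow)(struct stream *);
};

struct stream {
    const struct stream_ops *vtable;   /* like _IO_JUMPS */
};

#define STREAM_UFLOW(sp) ((sp)->vtable->uflow(sp))

static int file_uflow(struct stream *sp)
{
    (void) sp;          /* a real one would refill a buffer here */
    return 'x';
}

static const struct stream_ops file_ops = { file_uflow };

int main(void)
{
    struct stream s = { &file_ops };
    /* Dispatch through the jump table, as _IO_UFLOW does. */
    printf("%c\n", STREAM_UFLOW(&s));
    return 0;
}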

libio/fileops.c

int
_IO_new_file_underflow (FILE *fp)
{
  ssize_t count;

  /* C99 requires EOF to be "sticky".  */
  if (fp->_flags & _IO_EOF_SEEN)
    return EOF;

  if (fp->_flags & _IO_NO_READS)
    {
      fp->_flags |= _IO_ERR_SEEN;
      __set_errno (EBADF);
      return EOF;
    }
  if (fp->_IO_read_ptr < fp->_IO_read_end)
    return *(unsigned char *) fp->_IO_read_ptr;

  if (fp->_IO_buf_base == NULL)
    {
      /* Maybe we already have a push back pointer.  */
      if (fp->_IO_save_base != NULL)
    {
      free (fp->_IO_save_base);
      fp->_flags &= ~_IO_IN_BACKUP;
    }
      _IO_doallocbuf (fp);
    }

  /* FIXME This can/should be moved to genops ?? */
  if (fp->_flags & (_IO_LINE_BUF|_IO_UNBUFFERED))
    {
      /* We used to flush all line-buffered stream.  This really isn't
     required by any standard.  My recollection is that
     traditional Unix systems did this for stdout.  stderr better
     not be line buffered.  So we do just that here
     explicitly.  --drepper */
      _IO_acquire_lock (_IO_stdout);

      if ((_IO_stdout->_flags & (_IO_LINKED | _IO_NO_WRITES | _IO_LINE_BUF))
      == (_IO_LINKED | _IO_LINE_BUF))
    _IO_OVERFLOW (_IO_stdout, EOF);

      _IO_release_lock (_IO_stdout);
    }

  _IO_switch_to_get_mode (fp);

  /* This is very tricky. We have to adjust those
     pointers before we call _IO_SYSREAD () since
     we may longjump () out while waiting for
     input. Those pointers may be screwed up. H.J. */
  fp->_IO_read_base = fp->_IO_read_ptr = fp->_IO_buf_base;
  fp->_IO_read_end = fp->_IO_buf_base;
  fp->_IO_write_base = fp->_IO_write_ptr = fp->_IO_write_end
    = fp->_IO_buf_base;

  count = _IO_SYSREAD (fp, fp->_IO_buf_base,
               fp->_IO_buf_end - fp->_IO_buf_base);
  if (count <= 0)
    {
      if (count == 0)
    fp->_flags |= _IO_EOF_SEEN;
      else
    fp->_flags |= _IO_ERR_SEEN, count = 0;
  }
  fp->_IO_read_end += count;
  if (count == 0)
    {
      /* If a stream is read to EOF, the calling application may switch active
     handles.  As a result, our offset cache would no longer be valid, so
     unset it.  */
      fp->_offset = _IO_pos_BAD;
      return EOF;
    }
  if (fp->_offset != _IO_pos_BAD)
    _IO_pos_adjust (fp->_offset, count);
  return *(unsigned char *) fp->_IO_read_ptr;
}
libc_hidden_ver (_IO_new_file_underflow, _IO_file_underflow)
/* Guts of underflow callback if we mmap the file.  This stats the file and
   updates the stream state to match.  In the normal case we return zero.
   If the file is no longer eligible for mmap, its jump tables are reset to
   the vanilla ones and we return nonzero.  */
static int
mmap_remap_check (FILE *fp)
{
  struct stat64 st;

  if (_IO_SYSSTAT (fp, &st) == 0
      && S_ISREG (st.st_mode) && st.st_size != 0
      /* Limit the file size to 1MB for 32-bit machines.  */
      && (sizeof (ptrdiff_t) > 4 || st.st_size < 1*1024*1024))
    {
      const size_t pagesize = __getpagesize ();
# define ROUNDED(x) (((x) + pagesize - 1) & ~(pagesize - 1))
      if (ROUNDED (st.st_size) < ROUNDED (fp->_IO_buf_end
                      - fp->_IO_buf_base))
    {
      /* We can trim off some pages past the end of the file.  */
      (void) __munmap (fp->_IO_buf_base + ROUNDED (st.st_size),
               ROUNDED (fp->_IO_buf_end - fp->_IO_buf_base)
               - ROUNDED (st.st_size));
      fp->_IO_buf_end = fp->_IO_buf_base + st.st_size;
    }
      else if (ROUNDED (st.st_size) > ROUNDED (fp->_IO_buf_end
                           - fp->_IO_buf_base))
    {
      /* The file added some pages.  We need to remap it.  */
      void *p;
#if _G_HAVE_MREMAP
      p = __mremap (fp->_IO_buf_base, ROUNDED (fp->_IO_buf_end
                           - fp->_IO_buf_base),
            ROUNDED (st.st_size), MREMAP_MAYMOVE);
      if (p == MAP_FAILED)
        {
          (void) __munmap (fp->_IO_buf_base,
                   fp->_IO_buf_end - fp->_IO_buf_base);
          goto punt;
        }
#else
      (void) __munmap (fp->_IO_buf_base,
               fp->_IO_buf_end - fp->_IO_buf_base);
      p = __mmap64 (NULL, st.st_size, PROT_READ, MAP_SHARED,
            fp->_fileno, 0);
      if (p == MAP_FAILED)
        goto punt;
#endif
      fp->_IO_buf_base = p;
      fp->_IO_buf_end = fp->_IO_buf_base + st.st_size;
    }
      else
    {
      /* The number of pages didn't change.  */
      fp->_IO_buf_end = fp->_IO_buf_base + st.st_size;
    }
# undef ROUNDED

      fp->_offset -= fp->_IO_read_end - fp->_IO_read_ptr;
      _IO_setg (fp, fp->_IO_buf_base,
        fp->_offset < fp->_IO_buf_end - fp->_IO_buf_base
        ? fp->_IO_buf_base + fp->_offset : fp->_IO_buf_end,
        fp->_IO_buf_end);

      /* If we are already positioned at or past the end of the file, don't
     change the current offset.  If not, seek past what we have mapped,
     mimicking the position left by a normal underflow reading into its
     buffer until EOF.  */

      if (fp->_offset < fp->_IO_buf_end - fp->_IO_buf_base)
    {
      if (__lseek64 (fp->_fileno, fp->_IO_buf_end - fp->_IO_buf_base,
             SEEK_SET)
          != fp->_IO_buf_end - fp->_IO_buf_base)
        fp->_flags |= _IO_ERR_SEEN;
      else
        fp->_offset = fp->_IO_buf_end - fp->_IO_buf_base;
    }

      return 0;
    }
  else
    {
      /* Life is no longer good for mmap.  Punt it.  */
      (void) __munmap (fp->_IO_buf_base,
               fp->_IO_buf_end - fp->_IO_buf_base);
    punt:
      fp->_IO_buf_base = fp->_IO_buf_end = NULL;
      _IO_setg (fp, NULL, NULL, NULL);
      if (fp->_mode <= 0)
    _IO_JUMPS_FILE_plus (fp) = &_IO_file_jumps;
      else
    _IO_JUMPS_FILE_plus (fp) = &_IO_wfile_jumps;
      fp->_wide_data->_wide_vtable = &_IO_wfile_jumps;

      return 1;
    }
}

TL;DR: As David C. Rankin says in the OP comments, the moral of the story is that getc isn't horrendously slow, because glibc keeps an 8 KB stream buffer (BUFSIZ) for efficient I/O. getc simply reads one byte at a time from that buffer; upon reaching the end of the buffer, the buffer is refilled with more data, or EOF is returned.
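
You can observe this yourself: run a byte-counting loop like the one below under strace -e trace=read, and you should see roughly one read(2) per 8192 bytes consumed (the file name is illustrative):

#include <stdio.h>

/* Count bytes with getc; under `strace -e trace=read ./a.out` you
   should see about one read(2) per 8192 bytes consumed. */
int main(void)
{
    FILE *file = fopen("file.txt", "r");
    if (file == NULL)
        return 1;
    long bytes = 0;
    while (getc(file) != EOF)
        bytes++;
    printf("%ld bytes\n", bytes);
    fclose(file);
    return 0;
}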

Upvotes: 1

Basile Starynkevitch

That is, how frequently does it actually do operating system calls or something that would take a non-trivial amount of time?

You could look into the source code of GNU libc or of musl-libc to study their implementations of getc. You should also study the implementations of cat(1) and wc(1); both are open source. And GNU as (part of GNU binutils) is free software (used internally by most GCC compilations) which in practice runs very quickly while doing textual manipulation (transforming textual assembler input into binary object files); you could take inspiration from its source code.

You could change the buffer size with setvbuf(3), as sketched below.
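
A minimal sketch of that (the 1 MiB size is an arbitrary illustration, not a tuned recommendation):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    FILE *file = fopen("file.txt", "r");
    if (file == NULL)
        return 1;

    /* setvbuf must be called before the first read on the stream. */
    size_t size = 1 << 20;
    char *buffer = malloc(size);
    if (buffer != NULL)
        setvbuf(file, buffer, _IOFBF, size);

    int ch;
    while ((ch = getc(file)) != EOF) {
        /* do something with ch */
    }
    fclose(file);        /* the buffer must outlive the stream */
    free(buffer);
    return 0;
}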

You may want to read several bytes at once using fread(3) or fgets(3), probably in chunks of several kilobytes; see the sketch after this paragraph.
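
For instance, a sketch reading 64 KiB chunks with fread(3) (the chunk size is illustrative):

#include <stdio.h>

int main(void)
{
    FILE *file = fopen("file.txt", "rb");
    if (file == NULL)
        return 1;

    char chunk[65536];   /* 64 KiB: several kilobytes per call */
    size_t n;
    while ((n = fread(chunk, 1, sizeof chunk, file)) > 0) {
        for (size_t i = 0; i < n; i++) {
            /* process chunk[i] */
        }
    }
    fclose(file);
    return 0;
}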

You can also use the debugger gdb(1) or the strace(1) utility to find out when syscalls(2) are made, and which ones.

For example, let's say I had a 10TB file -- would calling this function trillions of times be a poor way to get through data?

Very probably not, because of the kernel's page cache.

You should profile and benchmark your program to find out its bottleneck.

Most of the time it won't be getc. See time(7) and gprof(1) (and compile all your code with GCC invoked as gcc -O3 -pg -Wall).

If raw input performance is critical in your program, consider also using open(2), read(2), mmap(2), madvise(2), readahead(2), posix_fadvise(2), and close(2) directly and wisely. Most of these syscalls can fail; see errno(3). A sketch follows.
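
For example, a sketch reading 1 MiB per read(2) after hinting sequential access (the buffer size and file name are illustrative):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
    int fd = open("file.txt", O_RDONLY);
    if (fd < 0)
        return 1;

    /* Hint sequential access so the kernel reads ahead aggressively. */
    posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);

    static char buf[1 << 20];   /* 1 MiB per syscall */
    ssize_t n;
    while ((n = read(fd, buf, sizeof buf)) > 0) {
        /* process buf[0..n) */
    }
    /* n < 0 means read failed; check errno(3). */
    close(fd);
    return 0;
}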

You may also change your file system (e.g. from Ext4 to XFS; see ext4(5) and xfs(5)), buy better SSDs or more physical RAM, or play with mount(2) options, to improve performance.

See also the /proc pseudo-file system (so proc(5)...); and this answer.

You may want to use databases like sqlite or PostgreSQL.

Your program could generate C code at runtime (like manydl.c does), try various approaches (compiling the generated C code /tmp/generated-c-1234.c as a plugin using gcc -O3 -fPIC /tmp/generated-c-1234.c -shared -o /tmp/generated-plugin-1234.so, then dlopen(3)-ing and dlsym(3)-ing that generated plugin), and use machine-learning techniques to find a very good one (specific to the current hardware and computer). It could also generate machine code more directly using asmjit or libgccjit, try several approaches, and choose the best one for the particular situation. Pitrat's book Artificial Beings and his blog (still here) explain this approach in more detail. The conceptual framework is called partial evaluation. See also this.
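
The generate-compile-dlopen cycle itself fits in a few lines; a hypothetical sketch (the paths follow the example above; link with -ldl on older glibc):

#include <dlfcn.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical sketch: generate C code, compile it as a plugin,
   then load and call it.  Paths and names are illustrative only. */
int main(void)
{
    FILE *src = fopen("/tmp/generated-c-1234.c", "w");
    if (src == NULL)
        return 1;
    fprintf(src, "int answer(void) { return 42; }\n");
    fclose(src);

    if (system("gcc -O3 -fPIC -shared /tmp/generated-c-1234.c"
               " -o /tmp/generated-plugin-1234.so") != 0)
        return 1;

    void *plugin = dlopen("/tmp/generated-plugin-1234.so", RTLD_NOW);
    if (plugin == NULL)
        return 1;
    int (*answer)(void) = (int (*)(void)) dlsym(plugin, "answer");
    if (answer != NULL)
        printf("%d\n", answer());
    dlclose(plugin);
    return 0;
}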

You could also use existing parser generators like GNU bison or ANTLR. They are generating C code.

Ian Taylor's libbacktrace could also be useful in such a dynamic metaprogramming approach (generating various forms of C code, and choosing the best ones according to the call stack inspected with dladdr(3)).

Very probably your problem is a parsing problem. So read the first half of the Dragon book.

Before attempting any experimentation, discuss with your manager/boss/client whether it is worth spending months of full-time work to gain a few percent of performance. Take into account that the same gain can often be obtained by upgrading the hardware.

If your terabyte input textual file does not change often (e.g. it is delivered every week, as happens with some bioinformatics software), it may be worthwhile to preprocess it and transform it, in batch mode, into a binary file, an sqlite database, a GDBM indexed file, or some Redis store. Then documenting the format of that binary file or database (using EBNF notation, taking inspiration from elf(5)) is very important.

Upvotes: 4
