Steve Lau
Steve Lau

Reputation: 1003

List all the hole and data segments in a sparse file

I am trying to implement a program that prints all the hole and data segments of a regular sparse file using lseek(2) with the SEEK_DATA and SEEK_HOLE whence values. The output should look something like this:

$ ./list_hole_and_data_segs sparse_file
This file has 100 bytes
[0, 10]: hole
[11, 99]: data(end)

Implementation

/*
 * list_hole_and_data_segs.c
*/
#define _GNU_SOURCE

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/* Kind of segment found at a given file offset. */
enum Type {
    HOLE = 0, /* region reported by lseek(SEEK_HOLE) */
    DATA = 1  /* region reported by lseek(SEEK_DATA) */
};

/* Prints the size of the file open on `fd`, then its hole and data segments. */
void find_all_holes(int fd);

/*
 * Usage: list_hole_and_data_segs FILE
 *
 * Opens FILE read-only and prints its hole/data segments via
 * find_all_holes(). Exits with EXIT_FAILURE when no file name is given or
 * the file cannot be opened.
 */
int main(int ac, char *av[])
{
    // Without this guard av[1] is NULL and open() is handed a null path.
    if (ac < 2) {
        fprintf(stderr, "usage: %s FILE\n",
                (ac > 0 && av[0] != NULL) ? av[0] : "list_hole_and_data_segs");
        exit(EXIT_FAILURE);
    }

    int fd = open(av[1], O_RDONLY);
    if (fd == -1) {
        perror("open");
        exit(EXIT_FAILURE);
    }

    find_all_holes(fd);

    // The descriptor was only read from, so a close() failure loses no data.
    close(fd);
    return 0;
}

void find_all_holes(int fd)
{
    off_t cur_offset = 0; // current offset
    enum Type cur_type; // current byte type

    off_t file_size = lseek(fd, 0, SEEK_END);
    off_t index_of_last_byte = file_size - 1;

    printf("This file has %ld bytes\n", file_size);

    // check the type of byte 0
    off_t res = lseek(fd, 0, SEEK_HOLE);
    if (res == 0) {
        cur_type = HOLE;
    } else if (res == file_size) {
        printf("[0, %ld]: data(then exit)\n", index_of_last_byte);
        exit(0);
    } else {
        cur_type = DATA;
        cur_offset = res;
    }

    while (cur_offset <= index_of_last_byte) {
        off_t new_offset =lseek(fd, cur_offset,
                  ((cur_type == DATA) ? SEEK_HOLE : SEEK_DATA));
        if ((cur_type == HOLE && new_offset == -1 && errno == ENXIO) ||
            (cur_type == DATA && new_offset == file_size)) {
            // from current position to the end of this file: `cur_type`
            printf("[%ld, %ld]: %s(end)\n", cur_offset,
                   index_of_last_byte,
                   ((cur_type == DATA) ? "data" : "hole"));
            break; // exit of while loop
        } else {
            // from current offset to the new offset: `cur_type`
            printf("[%ld, %ld]: %s\n", cur_offset, new_offset - 1,
                   ((cur_type == DATA) ? "data" : "hole"));

            cur_offset = new_offset;
            cur_type = (cur_type == DATA) ? HOLE : DATA;
        }
    }
}

Test my implementation

I use the following code snippet to create a sparse file, error handling is omitted for simplicity:

/*
 * create_sparse_file.c
*/
#include <fcntl.h>
#include <unistd.h>

/*
 * Creates (or truncates) "sparse_file", seeks 10000 bytes forward from the
 * start without writing, then writes the 5 bytes "HELLO" and closes the file.
 * Return values of the system calls are deliberately not inspected.
 */
int main(void)
{
    const int open_flags = O_WRONLY | O_CREAT | O_TRUNC;
    const off_t skip = 10000;                 /* bytes to leave unwritten */
    static const char payload[] = "HELLO";

    int out = open("sparse_file", open_flags, 0666);

    lseek(out, skip, SEEK_CUR);
    write(out, payload, sizeof payload - 1);  /* exclude the trailing NUL */
    close(out);

    return 0;
}
$ gcc create_sparse_file.c -o create_sparse_file && ./create_sparse_file

$ stat sparse_file
  File: sparse_file
  Size: 10005           Blocks: 8          IO Block: 4096   regular file
Device: 803h/2051d      Inode: 3556105     Links: 1

# create a normal file as a comparison
$ cp sparse_file not_sparse_file --sparse=never
$ stat not_sparse_file
  File: not_sparse_file
  Size: 10005           Blocks: 24         IO Block: 4096   regular file
Device: 803h/2051d      Inode: 3557867     Links: 1

$ gcc list_hole_and_data_segs.c -o list_hole_and_data_segs

$ ./list_hole_and_data_segs sparse_file
This file has 10005 bytes
[0, 8191]: hole
[8192, 10004]: data(end)

Question

As you can see, the output of ./list_hole_and_data_segs sparse_file is:

[0, 8191]: hole
[8192, 10004]: data(end)

And the real case is:

[0, 9999]: hole
[10000, 10004]: data(end)

Why is the output of list_hole_and_data_segs inconsistent with the real case, and how can I make it correct?

Environment

$ uname -a
Linux pop-os 5.17.15-76051715-generic #202206141358~1655919116~22.04~1db9e34 SMP PREEMPT Wed Jun 22 19 x86_64 x86_64 x86_64 GNU/Linux

$ df -hT .
Filesystem     Type  Size  Used Avail Use% Mounted on
/dev/sda3      ext4  103G   54G   44G  56% /

$ stat -f .
  File: "."
    ID: 4885eb446c106708 Namelen: 255     Type: ext2/ext3
Block size: 4096       Fundamental block size: 4096
Blocks: Total: 26819732   Free: 12805152   Available: 11431226
Inodes: Total: 6856704    Free: 6062138

$ gcc --version
gcc (Ubuntu 11.2.0-19ubuntu1) 11.2.0

$ ldd --version
ldd (Ubuntu GLIBC 2.35-0ubuntu3) 2.35

Upvotes: 0

Views: 302

Answers (0)

Related Questions