fati
fati

Reputation: 29

extract comma separated strings from txt file in C

I need to read different comma-separated strings from a file and store them in an array.

I have the following code, which I developed after reading different questions online.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Reads "pathtofile.txt" with fgets, splits every chunk on ',' with strtok,
 * copies each token into Nseq[] and prints it followed by its index.
 *
 * Known limitations of this version:
 *  - fgets stores at most sizeof(line) - 1 = 299 characters per call, so a
 *    line longer than that is returned in several pieces and a token that
 *    straddles a piece boundary is split into two tokens.
 *  - '\n' is not in the delimiter set, so a newline kept by fgets stays
 *    inside the last token of a line.
 *  - n is never checked against N, and strcpy is not bounded by L.
 */
int main (){
int N = 300; /* number of rows in Nseq */
int L = 1000; /* size in bytes of each row of Nseq */
char Nseq[N][L];

FILE *myfile;
char *token;
const char s[2] = ","; /* delimiter set handed to strtok */
char line[300]; /* fgets buffer: caps each read at 299 characters */
char* filename = "pathtofile.txt";
int n = 0; /* index of the next free row in Nseq */

myfile = fopen(filename, "r");
/* NOTE: exit(0) signals success to the caller even though the open failed. */
if (myfile == NULL) {printf("could not open file %s", filename); exit(0);}
/* Each iteration handles one fgets chunk, which is not necessarily a whole line. */
while (fgets(line, sizeof(line), myfile) != NULL){
  token = strtok(line, s);
  while (token != NULL){
    strcpy(Nseq[n], token);
    /* NOTE: n is an int but is printed with the unsigned conversion %u. */
    printf("%s\t%u\n", token, n);
    token = strtok(NULL, s);
    n++;
  }
}
fclose(myfile);
}

My txt file is the following:

1AAAAAAAAAAAAAAAAAAAAAAAAAAAA,2AAAAAAAAAAAAAAAAAAAAAAAAAAAA,3AAAAAAAAAAAAAAAAAAAAAAAAAAAA,4AAAAAAAAAAAAAAAAAAAAAAAAAAAA,5AAAAAAAAAAAAAAAAAAAAAAAAAAAA,6AAAAAAAAAAAAAAAAAAAAAAAAAAAA,7AAAAAAAAAAAAAAAAAAAAAAAAAAAA,8AAAAAAAAAAAAAAAAAAAAAAAAAAAA,9AAAAAAAAAAAAAAAAAAAAAAAAAAAA,10AAAAAAAAAAAAAAAAAAAAAAAAAAAA,11AAAAAAAAAAAAAAAAAAAAAAAAAAAA,12AAAAAAAAAAAAAAAAAAAAAAAAAAAA,13AAAAAAAAAAAAAAAAAAAAAAAAAAAA,14AAAAAAAAAAAAAAAAAAAAAAAAAAAA,15AAAAAAAAAAAAAAAAAAAAAAAAAAAA,16AAAAAAAAAAAAAAAAAAAAAAAAAAAA,17AAAAAAAAAAAAAAAAAAAAAAAAAAAA,18AAAAAAAAAAAAAAAAAAAAAAAAAAAA,19AAAAAAAAAAAAAAAAAAAAAAAAAAAA,20AAAAAAAAAAAAAAAAAAAAAAAAAAAA,21AAAAAAAAAAAAAAAAAAAAAAAAAAAA,22AAAAAAAAAAAAAAAAAAAAAAAAAAAA,23AAAAAAAAAAAAAAAAAAAAAAAAAAAA,24AAAAAAAAAAAAAAAAAAAAAAAAAAAA,25AAAAAAAAAAAAAAAAAAAAAAAAAAAA,26AAAAAAAAAAAAAAAAAAAAAAAAAAAA,27AAAAAAAAAAAAAAAAAAAAAAAAAAAA,28AAAAAAAAAAAAAAAAAAAAAAAAAAAA,29AAAAAAAAAAAAAAAAAAAAAAAAAAAA,30AAAAAAAAAAAAAAAAAAAAAAAAAAAA,

There are 30 strings and no new-line characters.

My issue is that when I run the code, I get the following output:

1AAAAAAAAAAAAAAAAAAAAAAAAAAAA   0
2AAAAAAAAAAAAAAAAAAAAAAAAAAAA   1
3AAAAAAAAAAAAAAAAAAAAAAAAAAAA   2
4AAAAAAAAAAAAAAAAAAAAAAAAAAAA   3
5AAAAAAAAAAAAAAAAAAAAAAAAAAAA   4
6AAAAAAAAAAAAAAAAAAAAAAAAAAAA   5
7AAAAAAAAAAAAAAAAAAAAAAAAAAAA   6
8AAAAAAAAAAAAAAAAAAAAAAAAAAAA   7
9AAAAAAAAAAAAAAAAAAAAAAAAAAAA   8
10AAAAAAAAAAAAAAAAAAAAAAAAAAA   9
A       10
11AAAAAAAAAAAAAAAAAAAAAAAAAAAA  11
12AAAAAAAAAAAAAAAAAAAAAAAAAAAA  12
13AAAAAAAAAAAAAAAAAAAAAAAAAAAA  13
14AAAAAAAAAAAAAAAAAAAAAAAAAAAA  14
15AAAAAAAAAAAAAAAAAAAAAAAAAAAA  15
16AAAAAAAAAAAAAAAAAAAAAAAAAAAA  16
17AAAAAAAAAAAAAAAAAAAAAAAAAAAA  17
18AAAAAAAAAAAAAAAAAAAAAAAAAAAA  18
19AAAAAAAAAAAAAAAAAAAAAAAAAAAA  19
20AAAAAAAAAAAAAAAA      20
AAAAAAAAAAAA    21
21AAAAAAAAAAAAAAAAAAAAAAAAAAAA  22
22AAAAAAAAAAAAAAAAAAAAAAAAAAAA  23
23AAAAAAAAAAAAAAAAAAAAAAAAAAAA  24
24AAAAAAAAAAAAAAAAAAAAAAAAAAAA  25
25AAAAAAAAAAAAAAAAAAAAAAAAAAAA  26
26AAAAAAAAAAAAAAAAAAAAAAAAAAAA  27
27AAAAAAAAAAAAAAAAAAAAAAAAAAAA  28
28AAAAAAAAAAAAAAAAAAAAAAAAAAAA  29
29AAAAAAAAAAAAAAAAAAAAAAAAAAAA  30
30AAAAA 31
AAAAAAAAAAAAAAAAAAAAAAA 32

        33

I have tried with different lengths and sooner or later I get these weird splits.

Does anyone know why this is happening? Thank you!

Upvotes: 1

Views: 851

Answers (1)

Craig Estey
Craig Estey

Reputation: 33621

Your text file is 921 chars in size and is a single line.

Your line buffer is only 300 chars.

So, you're getting truncation.

Also, note that your file has no newline. And, your code didn't handle the case where there is a newline (particularly, if the line ended in ,<newline>).

The simple solution is to increase the size of line so that it is larger than the size of the file (e.g.) char line[10000];

The long term solution is to either read the file char-by-char with (e.g.) fgetc and copy into Nseq[n] and store/print the token after a delimiter.

Or, you can stat the file, and use malloc to allocate a buffer that is the file size.

But, although slightly more advanced, the fastest way [particularly for large files] is to stat the file, mmap it, and then scan the buffer. This will work well on any 64-bit machine, or you could map it in chunks on a 32-bit machine.


Here's a version that uses fgetc:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Read "pathtofile.txt" one character at a time, split it on ',' and '\n',
 * store every non-empty token in Nseq[] and print it followed by its index.
 *
 * End of file also terminates the current token, so a last token that is
 * not followed by a delimiter is still stored and printed.  Tokens longer
 * than L - 1 characters are truncated; reading stops with a warning once
 * N tokens are stored and another one begins.
 *
 * Returns 0 on success, EXIT_FAILURE if the file cannot be opened.
 */
int
main(void)
{
    int N = 300;
    int L = 1000;
    char Nseq[N][L];

    FILE *myfile;
    char *filename = "pathtofile.txt";
    int chr;
    int n = 0;                  /* number of tokens stored so far */
    int len = 0;                /* characters collected for Nseq[n] */

    myfile = fopen(filename, "r");
    if (myfile == NULL) {
        fprintf(stderr, "could not open file %s", filename);
        return EXIT_FAILURE;
    }

    for (;;) {
        chr = fgetc(myfile);

        if (chr == ',' || chr == '\n' || chr == EOF) {
            /* Empty fields (",," or ",\n") are skipped, not stored. */
            if (len > 0) {
                Nseq[n][len] = '\0';
                printf("%s\t%d\n", Nseq[n], n);
                ++n;
                len = 0;
            }
            if (chr == EOF)
                break;
            continue;
        }

        /* Only complain when a token beyond the table capacity really starts. */
        if (n == N) {
            fprintf(stderr, "more than %d tokens in %s, ignoring the rest\n",
                    N, filename);
            break;
        }

        /* Keep one byte free for the terminator; excess characters are dropped. */
        if (len < L - 1)
            Nseq[n][len++] = (char) chr;
    }

    fclose(myfile);

    return 0;
}

Here's a version that uses malloc:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>

/*
 * Load "pathtofile.txt" into a heap buffer sized from stat(), split it on
 * ',' and line endings, copy every token into Nseq[] and print it followed
 * by its index.
 *
 * The whole file is read with a single fread, so no token can be cut in two
 * by a fixed-size line buffer.  Line-ending characters are part of the
 * delimiter set, so they never end up inside a token.  Tokens longer than
 * L - 1 characters are truncated; at most N tokens are stored.
 *
 * Returns 0 on success, EXIT_FAILURE on any stat/allocation/open/read error.
 */
int
main(void)
{
    int N = 300;
    int L = 1000;
    char Nseq[N][L];

    FILE *myfile;
    char *token;
    const char s[] = ",\r\n";
    char *line;
    size_t len;
    char *filename = "pathtofile.txt";
    struct stat st;
    int n = 0;

    if (stat(filename, &st) < 0) {
        fprintf(stderr, "could not stat file %s", filename);
        return EXIT_FAILURE;
    }
    len = (size_t) st.st_size;

    /* One extra byte for the terminator that strtok needs. */
    line = malloc(len + 1);
    if (line == NULL) {
        fprintf(stderr, "could not allocate buffer for file %s", filename);
        return EXIT_FAILURE;
    }

    myfile = fopen(filename, "r");
    if (myfile == NULL) {
        fprintf(stderr, "could not open file %s", filename);
        free(line);
        return EXIT_FAILURE;
    }

    /* Use the byte count actually read: it can be smaller than st_size. */
    len = fread(line, 1, len, myfile);
    if (ferror(myfile)) {
        fprintf(stderr, "could not read file %s", filename);
        fclose(myfile);
        free(line);
        return EXIT_FAILURE;
    }
    line[len] = '\0';
    fclose(myfile);

    for (token = strtok(line, s); token != NULL; token = strtok(NULL, s)) {
        if (n == N) {
            fprintf(stderr, "more than %d tokens in %s, ignoring the rest\n",
                    N, filename);
            break;
        }
        /* snprintf always terminates and never writes past the row. */
        snprintf(Nseq[n], (size_t) L, "%s", token);
        printf("%s\t%d\n", Nseq[n], n);
        n++;
    }

    free(line);

    return 0;
}

Here's a version that uses mmap:

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/mman.h>

/*
 * Map "pathtofile.txt" read-only into memory, split it on ',' and '\n',
 * store every non-empty token in Nseq[] and print it followed by its index.
 *
 * The end of the mapping is treated as a final delimiter, so a last token
 * without a trailing ',' or newline is still stored and printed.  An empty
 * file is accepted (mmap cannot map zero bytes).  Tokens longer than L - 1
 * characters are truncated; scanning stops with a warning once N tokens
 * are stored and another one begins.
 *
 * Returns 0 on success, EXIT_FAILURE on any open/fstat/mmap error.
 */
int
main(void)
{
    int N = 300;
    int L = 1000;
    char Nseq[N][L];

    char *line;
    char *filename = "pathtofile.txt";
    int fd;
    int chr;
    int n = 0;                  /* number of tokens stored so far */
    int toklen = 0;             /* characters collected for Nseq[n] */
    struct stat st;
    size_t len;
    size_t pos;

    fd = open(filename, O_RDONLY);
    if (fd < 0) {
        fprintf(stderr, "could not open file %s", filename);
        return EXIT_FAILURE;
    }

    if (fstat(fd, &st) < 0) {
        fprintf(stderr, "could not stat file %s", filename);
        close(fd);
        return EXIT_FAILURE;
    }
    len = (size_t) st.st_size;

    /* A zero-length mapping is rejected by mmap; an empty file has no tokens. */
    if (len == 0) {
        close(fd);
        return 0;
    }

    line = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
    if (line == MAP_FAILED) {
        fprintf(stderr, "could not mmap file %s", filename);
        close(fd);
        return EXIT_FAILURE;
    }

    /* pos == len is a virtual trailing delimiter that flushes the last token. */
    for (pos = 0; pos <= len; ++pos) {
        chr = (pos < len) ? line[pos] : '\n';

        if (chr == ',' || chr == '\n') {
            /* Empty fields (",," or ",\n") are skipped, not stored. */
            if (toklen > 0) {
                Nseq[n][toklen] = '\0';
                printf("%s\t%d\n", Nseq[n], n);
                ++n;
                toklen = 0;
            }
            continue;
        }

        /* Only complain when a token beyond the table capacity really starts. */
        if (n == N) {
            fprintf(stderr, "more than %d tokens in %s, ignoring the rest\n",
                    N, filename);
            break;
        }

        /* Keep one byte free for the terminator; excess characters are dropped. */
        if (toklen < L - 1)
            Nseq[n][toklen++] = (char) chr;
    }

    munmap(line, len);
    close(fd);

    return 0;
}

Upvotes: 2

Related Questions