Reputation: 29
I need to read comma-separated strings from a file and store them in an array.
I have the following code, that I developed reading different questions online.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Reads comma-separated tokens from a text file, copies each token into
 * Nseq[] and prints it next to its running index.
 */
int main (){
int N = 300;
int L = 1000;
char Nseq[N][L];
FILE *myfile;
char *token;
const char s[2] = ",";
/* Chunk buffer for fgets(): one call stores at most sizeof(line) - 1 = 299 chars. */
char line[300];
char* filename = "pathtofile.txt";
int n = 0;
myfile = fopen(filename, "r");
if (myfile == NULL) {printf("could not open file %s", filename); exit(0);}
/* A line longer than 299 chars is returned over several fgets() calls, so a
 * token that straddles a chunk boundary is tokenized as two separate pieces. */
while (fgets(line, sizeof(line), myfile) != NULL){
token = strtok(line, s);
while (token != NULL){
/* NOTE(review): n is not checked against N, nor the token length against L. */
strcpy(Nseq[n], token);
printf("%s\t%u\n", token, n);
token = strtok(NULL, s);
n++;
}
}
fclose(myfile);
}
my txt file is the following:
1AAAAAAAAAAAAAAAAAAAAAAAAAAAA,2AAAAAAAAAAAAAAAAAAAAAAAAAAAA,3AAAAAAAAAAAAAAAAAAAAAAAAAAAA,4AAAAAAAAAAAAAAAAAAAAAAAAAAAA,5AAAAAAAAAAAAAAAAAAAAAAAAAAAA,6AAAAAAAAAAAAAAAAAAAAAAAAAAAA,7AAAAAAAAAAAAAAAAAAAAAAAAAAAA,8AAAAAAAAAAAAAAAAAAAAAAAAAAAA,9AAAAAAAAAAAAAAAAAAAAAAAAAAAA,10AAAAAAAAAAAAAAAAAAAAAAAAAAAA,11AAAAAAAAAAAAAAAAAAAAAAAAAAAA,12AAAAAAAAAAAAAAAAAAAAAAAAAAAA,13AAAAAAAAAAAAAAAAAAAAAAAAAAAA,14AAAAAAAAAAAAAAAAAAAAAAAAAAAA,15AAAAAAAAAAAAAAAAAAAAAAAAAAAA,16AAAAAAAAAAAAAAAAAAAAAAAAAAAA,17AAAAAAAAAAAAAAAAAAAAAAAAAAAA,18AAAAAAAAAAAAAAAAAAAAAAAAAAAA,19AAAAAAAAAAAAAAAAAAAAAAAAAAAA,20AAAAAAAAAAAAAAAAAAAAAAAAAAAA,21AAAAAAAAAAAAAAAAAAAAAAAAAAAA,22AAAAAAAAAAAAAAAAAAAAAAAAAAAA,23AAAAAAAAAAAAAAAAAAAAAAAAAAAA,24AAAAAAAAAAAAAAAAAAAAAAAAAAAA,25AAAAAAAAAAAAAAAAAAAAAAAAAAAA,26AAAAAAAAAAAAAAAAAAAAAAAAAAAA,27AAAAAAAAAAAAAAAAAAAAAAAAAAAA,28AAAAAAAAAAAAAAAAAAAAAAAAAAAA,29AAAAAAAAAAAAAAAAAAAAAAAAAAAA,30AAAAAAAAAAAAAAAAAAAAAAAAAAAA,
There are 30 strings and no new-line characters.
My issue is that when I run the code, I get the following output:
1AAAAAAAAAAAAAAAAAAAAAAAAAAAA 0
2AAAAAAAAAAAAAAAAAAAAAAAAAAAA 1
3AAAAAAAAAAAAAAAAAAAAAAAAAAAA 2
4AAAAAAAAAAAAAAAAAAAAAAAAAAAA 3
5AAAAAAAAAAAAAAAAAAAAAAAAAAAA 4
6AAAAAAAAAAAAAAAAAAAAAAAAAAAA 5
7AAAAAAAAAAAAAAAAAAAAAAAAAAAA 6
8AAAAAAAAAAAAAAAAAAAAAAAAAAAA 7
9AAAAAAAAAAAAAAAAAAAAAAAAAAAA 8
10AAAAAAAAAAAAAAAAAAAAAAAAAAA 9
A 10
11AAAAAAAAAAAAAAAAAAAAAAAAAAAA 11
12AAAAAAAAAAAAAAAAAAAAAAAAAAAA 12
13AAAAAAAAAAAAAAAAAAAAAAAAAAAA 13
14AAAAAAAAAAAAAAAAAAAAAAAAAAAA 14
15AAAAAAAAAAAAAAAAAAAAAAAAAAAA 15
16AAAAAAAAAAAAAAAAAAAAAAAAAAAA 16
17AAAAAAAAAAAAAAAAAAAAAAAAAAAA 17
18AAAAAAAAAAAAAAAAAAAAAAAAAAAA 18
19AAAAAAAAAAAAAAAAAAAAAAAAAAAA 19
20AAAAAAAAAAAAAAAA 20
AAAAAAAAAAAA 21
21AAAAAAAAAAAAAAAAAAAAAAAAAAAA 22
22AAAAAAAAAAAAAAAAAAAAAAAAAAAA 23
23AAAAAAAAAAAAAAAAAAAAAAAAAAAA 24
24AAAAAAAAAAAAAAAAAAAAAAAAAAAA 25
25AAAAAAAAAAAAAAAAAAAAAAAAAAAA 26
26AAAAAAAAAAAAAAAAAAAAAAAAAAAA 27
27AAAAAAAAAAAAAAAAAAAAAAAAAAAA 28
28AAAAAAAAAAAAAAAAAAAAAAAAAAAA 29
29AAAAAAAAAAAAAAAAAAAAAAAAAAAA 30
30AAAAA 31
AAAAAAAAAAAAAAAAAAAAAAA 32
33
I have tried with different lengths and sooner or later I get these weird splits.
Does anyone know why this is happening? Thank you!
Upvotes: 1
Views: 851
Reputation: 33621
Your text file is 921 chars in size and is a single line.
Your line
buffer is only 300 chars.
So, fgets returns the line in chunks of at most 299 chars, and any token that straddles a chunk boundary gets split in two.
Also, note that your file has no newline, and your code doesn't handle the case where there is one (particularly if the line ends in ",<newline>").
The simple solution is to increase the size of line
so that it is larger than the size of the file (e.g.) char line[10000];
The long term solution is to either read the file char-by-char with (e.g.) fgetc
and copy into Nseq[n]
and store/print the token after a delimiter.
Or, you can stat
the file, and use malloc
to allocate a buffer that is the file size.
But, although slightly more advanced, the fastest way [particularly for large files], is to stat
the file, mmap
it, and then scan the buffer. This will work well on any 64 bit machine, or you could map it in chunks on a 32 bit machine
Here's a version that uses fgetc
:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Read comma- or newline-separated tokens from "pathtofile.txt" one character
 * at a time, store each token in Nseq[] and print it with its zero-based index.
 *
 * Empty tokens (e.g. ",," or ",\n") are skipped. A final token that is not
 * followed by a delimiter is still stored and printed. Tokens longer than
 * L - 1 characters are truncated, and reading stops once N tokens are stored.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if the file cannot be opened.
 */
int
main(void)
{
    /* Compile-time constants, so Nseq is a plain array rather than a VLA. */
    enum { N = 300, L = 1000 };
    /* static: keeps the 300 KB table off the stack. */
    static char Nseq[N][L];
    const char *filename = "pathtofile.txt";

    FILE *myfile = fopen(filename, "r");
    if (myfile == NULL) {
        fprintf(stderr, "could not open file %s\n", filename);
        exit(EXIT_FAILURE);
    }

    int n = 0;          /* number of tokens stored so far */
    size_t used = 0;    /* characters stored so far in Nseq[n] */

    for (;;) {
        int chr = fgetc(myfile);
        int at_end = (chr == EOF);

        if (at_end || chr == ',' || chr == '\n') {
            /* EOF is treated as a delimiter so an unterminated last token is kept. */
            if (used > 0) {
                Nseq[n][used] = '\0';
                printf("%s\t%d\n", Nseq[n], n);
                ++n;
                used = 0;
            }
            if (at_end || n == N)
                break;
        } else if (used < (size_t)L - 1) {
            /* Leave room for the terminating NUL; excess characters are dropped. */
            Nseq[n][used++] = (char)chr;
        }
    }

    fclose(myfile);
    return 0;
}
Here's a version that uses malloc
:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
/*
 * Read the whole of "pathtofile.txt" into a heap buffer sized from stat(),
 * split it on commas and newlines, store each token in Nseq[] and print it
 * with its zero-based index.
 *
 * Empty tokens are skipped (strtok semantics). Tokens longer than L - 1
 * characters are truncated when stored, and at most N tokens are kept.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if the file cannot be
 * stat'ed, opened, read, or if the buffer cannot be allocated.
 */
int
main(void)
{
    /* Compile-time constants, so Nseq is a plain array rather than a VLA. */
    enum { N = 300, L = 1000 };
    /* static: keeps the 300 KB table off the stack. */
    static char Nseq[N][L];
    /* Newline is a delimiter too, so ",\n" and multi-line files work. */
    const char *delims = ",\n";
    const char *filename = "pathtofile.txt";
    struct stat st;

    if (stat(filename, &st) < 0) {
        fprintf(stderr, "could not stat file %s\n", filename);
        exit(EXIT_FAILURE);
    }

    FILE *myfile = fopen(filename, "r");
    if (myfile == NULL) {
        fprintf(stderr, "could not open file %s\n", filename);
        exit(EXIT_FAILURE);
    }

    /* One extra byte for the NUL terminator that strtok needs. */
    size_t size = (size_t)st.st_size;
    char *line = malloc(size + 1);
    if (line == NULL) {
        fprintf(stderr, "could not allocate buffer for file %s\n", filename);
        fclose(myfile);
        exit(EXIT_FAILURE);
    }

    /* Use the byte count actually read: the file may have shrunk since stat(). */
    size_t got = fread(line, 1, size, myfile);
    if (ferror(myfile)) {
        fprintf(stderr, "could not read file %s\n", filename);
        free(line);
        fclose(myfile);
        exit(EXIT_FAILURE);
    }
    line[got] = '\0';
    fclose(myfile);

    int n = 0;
    for (char *token = strtok(line, delims);
         token != NULL && n < N;
         token = strtok(NULL, delims)) {
        /* Bounded copy: always NUL-terminates, truncates over-long tokens. */
        snprintf(Nseq[n], sizeof Nseq[n], "%s", token);
        printf("%s\t%d\n", Nseq[n], n);
        n++;
    }

    free(line);
    return 0;
}
Here's a version that uses mmap
:
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/mman.h>
/*
 * Map "pathtofile.txt" read-only into memory, scan it for comma- or
 * newline-separated tokens, store each token in Nseq[] and print it with its
 * zero-based index.
 *
 * Empty tokens (e.g. ",," or ",\n") are skipped. A final token that is not
 * followed by a delimiter is still stored and printed. Tokens longer than
 * L - 1 characters are truncated, and scanning stops once N tokens are stored.
 * An empty file produces no output.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if the file cannot be
 * opened, stat'ed or mapped.
 */
int
main(void)
{
    /* Compile-time constants, so Nseq is a plain array rather than a VLA. */
    enum { N = 300, L = 1000 };
    /* static: keeps the 300 KB table off the stack. */
    static char Nseq[N][L];
    const char *filename = "pathtofile.txt";

    int fd = open(filename, O_RDONLY);
    if (fd < 0) {
        fprintf(stderr, "could not open file %s\n", filename);
        exit(EXIT_FAILURE);
    }

    struct stat st;
    if (fstat(fd, &st) < 0) {
        fprintf(stderr, "could not stat file %s\n", filename);
        close(fd);
        exit(EXIT_FAILURE);
    }

    size_t len = (size_t)st.st_size;
    if (len == 0) {
        /* mmap() rejects zero-length mappings; an empty file has no tokens. */
        close(fd);
        return 0;
    }

    char *line = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
    if (line == MAP_FAILED) {
        fprintf(stderr, "could not mmap file %s\n", filename);
        close(fd);
        exit(EXIT_FAILURE);
    }

    const char *end = line + len;
    int n = 0;          /* number of tokens stored so far */
    size_t used = 0;    /* characters stored so far in Nseq[n] */

    for (const char *cur = line; n < N; ++cur) {
        int at_end = (cur == end);

        if (at_end || *cur == ',' || *cur == '\n') {
            /* End of mapping is treated as a delimiter so an unterminated
             * last token is kept. */
            if (used > 0) {
                Nseq[n][used] = '\0';
                printf("%s\t%d\n", Nseq[n], n);
                ++n;
                used = 0;
            }
            if (at_end)
                break;
        } else if (used < (size_t)L - 1) {
            /* Leave room for the terminating NUL; excess characters are dropped. */
            Nseq[n][used++] = *cur;
        }
    }

    munmap(line, len);
    close(fd);
    return 0;
}
Upvotes: 2