Reputation:
I am trying to read a comma-delimited CSV file. The content of the file is:
one,,three
And the code to read the file is this:
// Read one line of the file into the buffer `line`.
inFile.getline(line, 500);
// Split the line on commas. The first call names the buffer; the later
// calls pass NULL to continue with the same buffer.
// NOTE: strtok() skips over a run of delimiters, so for "one,,three" the
// second call yields "three" and the third call yields a null pointer.
token1 = strtok(line, ",");
token2 = strtok(NULL, ",");
token3 = strtok(NULL, ",");
// Print each token, or "null" when strtok() returned a null pointer.
if(token1 != NULL){
cout << "token1 = " << token1 << "\n";
}else{
cout << "token1 = null\n" ;
}
if(token2 != NULL){
cout << "token2 = " << token2 << "\n";
}else{
cout << "token2 = null\n" ;
}
if(token3 != NULL){
cout << "token3 = " << token3 << "\n";
}else{
cout << "token3 = null\n";
}
Output is this
token1 = one
token2 = three
token3 = null
Whereas I expected the output to be like this:
token1 = one
token2 = null
token3 = three
I did change if statements from
if(token1 != NULL)
To
if(token1)
But that doesn't work either.
After checking this example http://www.cplusplus.com/reference/cstring/strtok/, I have updated
token2 = strtok(NULL, ",");
To
token2 = strtok(NULL, ",,");
That does not work either.
Upvotes: 10
Views: 10166
Reputation: 33667
You can use strsep()
instead of strtok()
; the former treats consecutive delimiters as separating empty tokens and returns all of them, including the empty ones.
Unlike strtok()
, you don't have to call strsep()
with a NULL
first argument. You can call it like this:
#include <stdio.h>
#include <string.h>
/* Splits a comma-separated string with strsep() and prints every token,
 * including the empty ones found between adjacent commas. */
int main(void) {
char string[] = "this,is,the,string,,,,you,want,to,parse";
/* strsep() is handed the address of this cursor, not the array itself. */
char *strPtr = string;
char *token;
/* The assignment is the loop condition: the loop ends when strsep()
 * returns NULL. */
while (token = strsep(&strPtr, ",")) {
printf("Processing '%s'\n", token);
}
return 0;
}
The following output is produced by that program:
Processing 'this'
Processing 'is'
Processing 'the'
Processing 'string'
Processing ''
Processing ''
Processing ''
Processing 'you'
Processing 'want'
Processing 'to'
Processing 'parse'
If that while
condition makes you nervous or elicits a compiler warning, you can always check for NULL
explicitly:
/* Parenthesize the assignment so token receives the pointer returned by
 * strsep(), not the int result of the != comparison. */
while ((token = strsep(&strPtr, ",")) != NULL)
Keep in mind that some old compiler libraries don't have strsep()
and, technically, it's not a part of the ISO standard, but it should be available in most implementations.
Upvotes: 6
Reputation: 422
This is an improved, re-entrant version:
/*
 * Re-entrant tokenizer that, unlike strtok(), reports the empty tokens
 * lying between adjacent delimiters.
 *
 * string    - buffer to start tokenizing, or NULL to continue the scan
 *             recorded in *saveptr. The buffer is modified: each delimiter
 *             that ends a token is overwritten with '\0'.
 * delimiter - set of characters, any one of which ends a token.
 * saveptr   - caller-owned cursor holding the scan position between calls.
 *
 * Returns the next token (possibly an empty string), or NULL once the
 * buffer has been fully consumed.
 */
char *strtok_new_r(char * string, char const * delimiter, char **saveptr) {
    /* A non-NULL string starts a fresh scan. */
    if (string != NULL) {
        *saveptr = string;
    }

    char *start = *saveptr;
    if (start == NULL) {
        return NULL;
    }

    char *sep = strpbrk(start, delimiter);
    if (sep == NULL) {
        /* No delimiter left: the remainder is the last token. */
        *saveptr = NULL;
        return start;
    }

    /* Terminate the token in place and resume just after the delimiter. */
    *sep = '\0';
    *saveptr = sep + 1;
    return start;
}
Upvotes: 0
Reputation:
I once faced this problem while reading a comma-delimited CSV file. We can't use strtok()
as a solution to problems where the delimiter character appears consecutively, because according to the standard:
The first call in the sequence searches the string pointed to by
s1
for the first character that is not contained in the current separator string pointed to by s2
. If no such character is found, then there are no tokens in the string pointed to by s1
and the strtok
function returns a null pointer. If such a character is found, it is the start of the first token. C11 §7.24.5.8 3
So, in my case, I wrote another solution using the strpbrk()
function, which should also be useful for you.
#include<iostream.h>
/*
 * strtok()-like tokenizer that reports the empty tokens lying between
 * adjacent delimiters.
 *
 * string    - buffer to start tokenizing, or NULL to continue with the
 *             buffer from the previous call. The buffer is modified: each
 *             delimiter that ends a token is overwritten with '\0'.
 * delimiter - set of characters, any one of which ends a token.
 *
 * Returns the next token (possibly an empty string), or NULL when there
 * are no more tokens. A delimiter at the very end of the buffer ends the
 * last token and is not followed by an extra empty token.
 *
 * The scan position is kept in a static variable, so only one
 * tokenization can be in progress at a time.
 */
char *strtok_new(char * string, char const * delimiter){
    static char *source = NULL;

    if(string != NULL) source = string;
    if(source == NULL) return NULL;

    char *token = source;
    char *p = strpbrk(source, delimiter);
    if(p != NULL) {
        /* Terminate the token in place and resume after the delimiter. */
        *p = 0;
        source = p + 1;
        return token;
    }

    /* No delimiter left. A non-empty remainder is the final token; it must
     * be returned, otherwise input such as "one,,three" loses "three". */
    source = NULL;
    return (*token != '\0') ? token : NULL;
}
// Tokenizes a sample line with strtok_new() and prints each field,
// printing "No data" for a field that is empty.
int main(){
    char string[] = "one,,three,";
    char delimiter[] = ",";

    for(char *field = strtok_new(string, delimiter);
        field != NULL;
        field = strtok_new(NULL, delimiter)){
        if(*field == '\0'){
            cout << "No data" << endl;
        }else{
            cout << field << endl;
        }
    }

    system("pause");
    return 0;
}
Output
one
No data
three
Hope this is your desired output.
Upvotes: 9
Reputation: 881553
From the standard (C99, referenced from C++11 for compatibility features):
The first call in the sequence searches the string pointed to by s1 for the first character that is not contained in the current separator string pointed to by s2.
Each subsequent call, with a null pointer as the value of the first argument, starts searching from the saved pointer and behaves as described above.
That means that, when looking for the second token, it first skips over all characters that match any in the delimiter string. Hence, ,,
is treated as a single delimiter in your input string.
If you want a tokeniser to work differently to the way the standard one works, you'll have to look elsewhere unfortunately, such as the code below:
#include <string.h>
// Tokeniser that, unlike strtok(), returns the empty tokens found between
// adjacent separators and at the start and end of the string.
//
// str  - string to start tokenising, or NULL to continue with the string
//        from the previous call. The string is modified while a token is
//        outstanding (its separator is temporarily replaced by a nul) and
//        is restored on the following call.
// seps - set of separator characters.
//
// Returns the next token (possibly empty), or NULL when the string is
// exhausted or no tokenisation has been started. Calling again after NULL
// has been returned keeps returning NULL.
//
// State is held in statics, so only one tokenisation can be active at once.
char *paxtok (char *str, char *seps) {
    static char *pos = NULL;
    static char savech;

    // Specific actions for first and subsequent calls.
    if (str != NULL) {
        // First call, set pointer. Any non-nul value marks "not at end".
        pos = str;
        savech = 'x';
    } else {
        // Subsequent calls, check we've done first.
        if (pos == NULL)
            return NULL;

        // Then put the separator back and advance past it.
        while (*pos != '\0')
            pos++;
        *pos++ = savech;
    }

    // Detect previous end of string. Drop the saved position so that any
    // further call returns NULL instead of reading past the terminator.
    if (savech == '\0') {
        pos = NULL;
        return NULL;
    }

    // pos points to the first character of the token.
    // Find the first separator or the terminating nul.
    char *end = pos + strcspn (pos, seps);

    // Remember what ended the token, then terminate the token in place.
    savech = *end;
    *end = '\0';

    return pos;
}
That, combined with the following test program, should give you what you need:
#include <stdio.h>
/* Reports why the command line was rejected, followed by the expected
 * invocation, on stderr. Always returns 1 so that main() can return the
 * result directly as its exit status. */
int usage (char *reason) {
    const int failure_status = 1;

    fprintf (stderr, "ERROR: %s.\n", reason);
    fputs ("Usage: testprog <string> <separator>\n", stderr);

    return failure_status;
}
int main (int argc, char *argv[]) {
if (argc != 3)
return usage ("wrong number of parameters");
printf ("Initial string is '%s'\n", argv[1]);
char *token = paxtok (argv[1], argv[2]);
while (token != NULL) {
printf ("Token is '%s'\n", token);
token = paxtok (NULL, argv[2]);
}
printf ("Final string is '%s'\n", argv[1]);
return 0;
}
That gives a full program so that you can test it, such as with the command:
testprog ,_start,,middle_,end, _,
which will tokenise the first string using the two separators in the second, the underscore and comma. The output of that shows how it works and you can see it picking up empty tokens, including at the start and end:
Initial string is ',_start,,middle_,end,'
Token is ''
Token is ''
Token is 'start'
Token is ''
Token is 'middle'
Token is ''
Token is 'end'
Token is ''
Final string is ',_start,,middle_,end,'
Just keep in mind that, using statics, it suffers from the same limitation as strtok
- you can't run two tokenising actions side by side. You could make a paxtok_r
to mirror strtok_r
but I'll leave that as an exercise for the reader.
Upvotes: 8
Reputation: 9570
http://www.cplusplus.com/reference/cstring/strtok/ says:
To determine the beginning and the end of a token, the function first scans from the starting location for the first character not contained in delimiters (which becomes the beginning of the token). And then scans starting from this beginning of the token for the first character contained in delimiters, which becomes the end of the token. The scan also stops if the terminating null character is found.
So, as the function 'scans (...) for the first character not contained in delimiters', it skips any sequence of delimiter characters. That makes it impossible for it to detect an 'empty token' between consecutive delimiters. You will have to scan the input string char-by-char yourself.
Upvotes: 2