Nik
Nik

Reputation: 11

Reading a C source file and skipping /**/ comments

I managed to write code to skip // comments in C source:

// Process the input one fgets() chunk at a time (at most 9999 characters
// plus the terminator are stored in string per call).
while (fgets(string, 10000, fin) != NULL)
{
    unsigned int i;
    // Walk the chunk character by character; strlen(string) is
    // re-evaluated on every iteration.
    for (i = 0; i < strlen(string); i++)
    {
        // Two consecutive slashes: the rest of the line is a comment.
        if ((string[i] == '/') && (string[i + 1] == '/'))
        {
            // Advance i to the newline that ends the line.
            // NOTE(review): there is no check for '\0' here, so a chunk
            // without a trailing '\n' makes this loop read past the
            // terminator.
            while (string[i += 1] != '\n')
                continue;
        } 
    //rest of the code...

I've tried to do a similar thing for /* */ comments:

// Opening marker found: a slash immediately followed by a star.
if ((string[i] == '/') && (string[i + 1] == '*'))
{
    // Advance i to the next '/' in the buffer.
    // NOTE(review): the loop stops at ANY later slash, not only one that
    // follows a star, and it has no check for '\0', so it can run past the
    // end of the current line's data.
    while (string[i += 1] != '/')
        continue;
}

// Closing marker found: a star immediately followed by a slash.
if ((string[i] == '*') && (string[i + 1] == '/'))
{
    // Moves i BACKWARDS until a zero byte is found.
    // NOTE(review): nothing stops this at the start of the buffer, so it
    // can index before string[0]; presumably i is the unsigned loop index
    // of the surrounding line loop - confirm.
    while (string[i -= 1])
        continue;
}

But it reads line by line and if I have, for example,

/*

text*/

then it counts the text.

How do I fix this?

Upvotes: 0

Views: 2522

Answers (6)

JJoao
JJoao

Reputation: 5347

(It is not very clear what your program is trying to do.)

Using flex to count the number of characters outside comments:

%option noyywrap

%%
   int i = 0;   /* characters seen outside comments; local to yylex() */

\"([^\\"]|\\.)*\"            { i += yyleng ; }       // string literals (escapes allowed)
\'([^\\'\n]|\\.)+\'          { i += yyleng ; }       // character constants, e.g. '"'
\/\/.*                       {               }       // C++ comments
\/\*([^*]|\*+[^*/])*\*+\/    {               }       // C comments, any run of stars before the closing slash
.|\n                         { i += yyleng ; }       // normal chars

<<EOF>>                      { printf("%d\n",i); return 0; }  // yylex() returns int
%%

int main(void){
  yylex();
  return 0;}

and

$ flex count-non-com.fl
$ cc -o count-non-com lex.yy.c
$ count-non-com < input

One last example: flex code to remove comments (thanks @LuisColorado)

%option noyywrap
%%

\"([^\\"]|\\.)*\"            { ECHO; }       // string literals (escapes allowed) are copied untouched
\'([^\\'\n]|\\.)+\'          { ECHO; }       // character constants, e.g. '"', are copied untouched
\/\/.*                       {       }       // C++ comments
\/\*([^*]|\*+[^*/])*\*+\/    {       }       // C comments, any run of stars before the closing slash
.|\n                         { ECHO; }       // normal chars

%%

int main(void){
  yylex();
  return 0;}

Upvotes: 0

Luis Colorado
Luis Colorado

Reputation: 12668

A simple regular expression for a C comment is:

\/\*([^*]|\*+[^*\/])*\*+\/

(Sorry for the escape characters) This allows any sequence inside a comment except */. It translates to the following DFA (four states):

  • state 0, input /, next state 1, output none
  • state 0, input other, next state 0, output read char
  • state 1, input *, next state 2, no output
  • state 1, input /, next state 1, output /
  • state 1, input other, next state 0, output / and read char
  • state 2, input *, next state 3, output none
  • state 2, input other, next state 2, output none
  • state 3, input /, next state 0, output none
  • state 3, input *, next state 3, output none
  • state 3, input other, next state 2, output none

The possible inputs are /, * and any other character. The possible outputs are: nothing, the character just read, a single /, or a / followed by the character just read.

This translates to the following code:

file uncomment.c:

#include <stdio.h>

/*
 * Filter: copies standard input to standard output with block comments
 * removed.
 *
 * Implemented as a four-state machine over the input characters:
 *   0  ordinary text
 *   1  a '/' has been read and is held back until the next character
 *      shows whether it opens a comment
 *   2  inside a block comment
 *   3  inside a block comment, immediately after a '*'
 *
 * String and character literals are not recognised, so a comment opener
 * inside a literal is treated as a comment.  Always returns 0.
 */
int main(void)
{
    int c, st = 0;
    while ((c = getchar()) != EOF) {
        switch (st) {
        case 0: /* ordinary text */
            switch (c) {
            case '/': st = 1; break;        /* hold the slash back */
            default: putchar(c); break;
            } /* switch */
            break;
        case 1: /* a '/' is pending */
            switch (c) {
            case '/': putchar('/'); break;  /* emit the older slash, keep the new one pending */
            case '*': st = 2; break;        /* comment opened: the pending slash is dropped */
            default: putchar('/'); putchar(c); st = 0; break;
            } /* switch */
            break;
        case 2: /* inside a comment */
            switch (c) {
            case '*': st = 3; break;
            default: break;
            } /* switch */
            break;
        case 3: /* inside a comment, just after a '*' */
            switch (c) {
            case '/': st = 0; break;        /* comment closed */
            case '*': break;                /* still a candidate for the closing pair */
            default: st = 2; break;
            } /* switch */
            break;
        default: /* unreachable: st is only ever 0..3 */
            break;
        } /* switch */
    } /* while */

    /* Input ended right after a '/': it was held back in state 1 and would
     * otherwise be lost, so emit it now. */
    if (st == 1) {
        putchar('/');
    }
    return 0;
} /* main */

In case you want to exclude both types of comments, we need to switch to a fifth state when receiving a second /, resulting in the following code:

file uncomment2.c:

#include <stdio.h>

/*
 * Filter: copies standard input to standard output with both block
 * comments and line comments removed.  The newline that terminates a line
 * comment is kept.
 *
 * Implemented as a five-state machine over the input characters:
 *   0  ordinary text
 *   1  a '/' has been read and is held back until the next character
 *      shows whether it opens a comment
 *   2  inside a block comment
 *   3  inside a block comment, immediately after a '*'
 *   4  inside a line comment
 *
 * String and character literals are not recognised, so a comment opener
 * inside a literal is treated as a comment.  Always returns 0.
 */
int main(void)
{
    int c, st = 0;
    while ((c = getchar()) != EOF) {
        switch (st) {
        case 0: /* ordinary text */
            switch (c) {
            case '/': st = 1; break;        /* hold the slash back */
            default: putchar(c); break;
            } /* switch */
            break;
        case 1: /* a '/' is pending */
            switch (c) {
            case '/': st = 4; break;        /* line comment: the pending slash is dropped */
            case '*': st = 2; break;        /* block comment: the pending slash is dropped */
            default: putchar('/'); putchar(c); st = 0; break;
            } /* switch */
            break;
        case 2: /* inside a block comment */
            switch (c) {
            case '*': st = 3; break;
            default: break;
            } /* switch */
            break;
        case 3: /* inside a block comment, just after a '*' */
            switch (c) {
            case '/': st = 0; break;        /* comment closed */
            case '*': break;                /* still a candidate for the closing pair */
            default: st = 2; break;
            } /* switch */
            break;
        // in the next line we put // inside an `old' comment
        // to illustrate this special case.  The switch has been put
        // after the comment to show it is not being commented out.
        case 4: /* we have read "// ..." */ switch(c) {
            case '\n': st = 0; putchar('\n'); break;
            default: break;                 // swallow the comment text
            } // switch  (to illustrate this kind of comment).
            break;
        default: /* unreachable: st is only ever 0..4 */
            break;
        } /* switch */
    } /* while */

    /* Input ended right after a '/': it was held back in state 1 and would
     * otherwise be lost, so emit it now. */
    if (st == 1) {
        putchar('/');
    }
    return 0;
} /* main */

Upvotes: 2

Kantajit
Kantajit

Reputation: 418

As user279599 just said, use an integer variable as a flag: whenever you get '/' and '*' consecutively, set the flag (flag = 1); the flag then stays 1 until you get '*' and '/' consecutively. Ignore every character while the flag is 1.

Upvotes: -1

user279599
user279599

Reputation: 1

Make an int variable. Scan the characters and store the index if you get /*. Continue scanning until you get */. If the variable !=0 at that time, then assume this is the closing comment token and ignore the characters in between.

Upvotes: -1

This simple code skips /* */ comments. It does not handle every case; for instance, a /* written inside a quoted string in the C code is still treated as the start of a comment.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/*
 * Print a file to standard output with its block comments removed.
 *
 * The file named by argv[1] is read; when no argument is given, the file
 * called "file" in the current directory is used.  The input is scanned
 * one character at a time with a single character of lookahead, so a
 * comment marker is found regardless of line length.
 *
 * Comment markers inside string literals are NOT recognised as such and
 * are treated as real comments.
 *
 * Returns 0 on success, 1 if the file cannot be opened.
 */
int main(int argc, char *argv[])
{
    const char *path = (argc > 1) ? argv[1] : "file";
    FILE *file = fopen(path, "r"); // open the file
    bool comment = false;          // true while inside a block comment
    int c;

    if (file == NULL) {
        perror(path);
        return 1;
    }

    while ((c = fgetc(file)) != EOF) {
        int next = fgetc(file); // lookahead; pushed back unless it completes a marker

        if (!comment && c == '/' && next == '*') {
            comment = true;  // opener consumed, ignore everything until the closer
            continue;
        }
        if (comment && c == '*' && next == '/') {
            comment = false; // closer consumed
            continue;
        }
        if (next != EOF) {
            ungetc(next, file);
        }
        if (!comment) {
            putchar(c); // the character is not inside a comment: print it
        }
    }

    fclose(file);
    return 0;
}

Upvotes: 0

John Bollinger
John Bollinger

Reputation: 180306

Even your supposedly-working code has several problems:

  1. It does not recognize any context, so it will treat // appearing within a string constant or within a /* ... */ comment as the beginning of a comment.
  2. In the unlikely event that you happen to have very long lines, they will be truncated (including their terminating newlines).

In the end, C is a stream-oriented language, not a line-oriented language. It should be parsed that way (character by character). To do the job right, you really need to implement a much more sophisticated parser. If you're up for learning a new tool, then you could consider basing your program on the Flex lexical analyzer.

Upvotes: 3

Related Questions