Cajuu'
Cajuu'

Reputation: 1166

Compare each line from two different files and print the lines that are different in C

Supposing that I have two files like this:

file1.txt

john
is
the new
guy

file2.txt

man
the old
is
rick
cat
dog

I'd like to compare the first line of file1 with all the lines of file2 and check whether it exists there. If not, go to the second line of file1 and compare it with all the lines of file2, and so on until EOF is reached in file1.

The output that I expect is:

john
the new
guy

How I thought this should be done:

Now, I don't know what I'm doing wrong, but I don't get the result that I expect:

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/*
 * Count the lines of text in the file named by filename.
 *
 * A line is a run of characters terminated by '\n'; a final run of
 * characters that is not terminated by '\n' also counts as one line.
 *
 * Returns the number of lines, or 0 if the file cannot be opened
 * (an empty file also yields 0).
 */
int countlines(const char *filename)
{
    FILE *fp = fopen(filename, "r");
    if (fp == NULL)
        return 0;

    int lines = 0;
    /* Start as if a newline came before the file, so an empty file counts 0. */
    int last = '\n';
    int ch;

    while ((ch = fgetc(fp)) != EOF) {
        if (ch == '\n')
            lines++;
        last = ch;
    }

    /* The last character read (not EOF) tells whether a final line is unterminated. */
    if (last != '\n')
        lines++;

    fclose(fp);
    return lines;
}

/*
 * Debug driver for the line comparison.
 *
 * Usage: prog <template_file> <data_file>
 *
 * For each line read from the template file, reads lines from the data file
 * and prints a running counter each time the two lines compare unequal.
 * Exits with EXIT_FAILURE on a wrong argument count or when a file cannot
 * be opened; returns 0 otherwise.
 */
int main(int argc, char *argv[])
{
    char buffer_line_template_file[100];
    char buffer_line_data_file[100];

    /* Validate argc before touching argv[1] / argv[2]. */
    if (argc != 3)
    {
        /* Not an errno failure, so perror() would append an unrelated message. */
        fprintf(stderr, "You didn't insert all the arguments!\n\n");
        exit(EXIT_FAILURE);
    }

    FILE *template_file = fopen(argv[1], "r");
    FILE *data_file = fopen(argv[2], "r");

    if (template_file == NULL || data_file == NULL)
    {
        perror("Error while opening the file!\n\n");
        exit(EXIT_FAILURE);
    }

    int counter = 0;
    /* countlines() re-reads the whole file every time a loop condition is evaluated. */
    for (int i = 0; i < countlines(argv[1]); i++)
    {
        /* Stop when no line was read, so the buffer is never compared stale. */
        if (fgets(buffer_line_template_file, 100, template_file) == NULL)
            break;

        /* data_file is not rewound: once it reaches EOF the inner loop ends at once. */
        for (int j = 0; j < countlines(argv[2]); j++)
        {
            if (fgets(buffer_line_data_file, 100, data_file) == NULL)
                break;

            if (strcmp(buffer_line_template_file, buffer_line_data_file) != 0)
            {
                counter++;
                printf("%d", counter);
            }
        }
    }

    printf("\n\n");

    fclose(template_file);
    fclose(data_file);

    return 0;
}

Could someone please point me in the right direction? For testing purposes I put a counter in the loop as a small piece of debugging. In the final version there should be a printf() call there instead.


As per @chux answer I got the following simplified code:

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/*
 * Compare each line of the template file against every line of the data file.
 *
 * Usage: prog <template_file> <data_file>
 *
 * Trailing newlines are removed before comparing. The template line is
 * printed once for every data line it differs from, so a template line is
 * printed multiple times when the data file has multiple non-matching lines.
 * Exits with EXIT_FAILURE on a wrong argument count or when a file cannot
 * be opened; returns 0 otherwise.
 */
int main(int argc, char *argv[])
{
    char buffer_line_template_file[100];
    char buffer_line_data_file[100];

    /* Validate argc before touching argv[1] / argv[2]. */
    if (argc != 3)
    {
        /* Not an errno failure, so perror() would append an unrelated message. */
        fprintf(stderr, "You didn't insert all the arguments!\n\n");
        exit(EXIT_FAILURE);
    }

    FILE *template_file = fopen(argv[1], "r");
    FILE *data_file = fopen(argv[2], "r");

    if (template_file == NULL || data_file == NULL)
    {
        perror("Error while opening the file!\n\n");
        exit(EXIT_FAILURE);
    }

    while (fgets(buffer_line_template_file, 100, template_file))
    {
        /* Cut the line at the first '\n', if there is one. */
        buffer_line_template_file[strcspn(buffer_line_template_file, "\n")] = '\0';

        /* Restart the data file so this template line is checked against all of it. */
        rewind(data_file);
        while (fgets(buffer_line_data_file, 100, data_file))
        {
            buffer_line_data_file[strcspn(buffer_line_data_file, "\n")] = '\0';

            /* Fires for each individual data line that differs. */
            if (strcmp(buffer_line_template_file, buffer_line_data_file) != 0)
            {
                printf("%s\n", buffer_line_template_file);
            }
        }
    }

    printf("\n\n");

    fclose(template_file);
    fclose(data_file);

    return 0;
}

The above code is giving me the following output, which is not what is expected:

john
john
john
john
john
john
is
is
is
is
is
the new
the new
the new
the new
the new
the new
guy
guy
guy
guy
guy
guy

Upvotes: 1

Views: 1024

Answers (3)

David C. Rankin
David C. Rankin

Reputation: 84561

You already have a very good answer (and always will from chux), but here is a slightly different approach to the problem. It uses automatic storage to read file2 into an array of strings and then compares each line in file1 against every line in file2 to determine whether it is unique. You can easily convert the code to dynamically allocate memory, but for the sake of simplicity that was omitted:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

enum { MAXC = 256, MAXL = 512 };

void file1infile2 (FILE *fp2, FILE *fp1, size_t *n2, size_t *n1);

/*
 * Open the two input files (argv[1] and argv[2], defaulting to "file1.txt"
 * and "file2.txt"), print the lines of file1 that are not in file2, then
 * print how many lines of each file were read.
 *
 * Returns 1 if either file cannot be opened, 0 otherwise.
 */
int main (int argc, char **argv) {

    FILE *fp1 = fopen (argc > 1 ? argv[1] : "file1.txt", "r");
    FILE *fp2 = fopen (argc > 2 ? argv[2] : "file2.txt", "r");
    size_t n1 = 0, n2 = 0;

    if (!fp1 || !fp2) {
        fprintf (stderr, "error: file open failed.\n");
        /* Release whichever stream did open before bailing out. */
        if (fp1)
            fclose (fp1);
        if (fp2)
            fclose (fp2);
        return 1;
    }

    printf ("\nunique words in file1, not in file 2.\n\n");
    file1infile2 (fp2, fp1, &n2, &n1);
    printf ("\nanalyzed %zu lines in file1 against %zu lines in file2.\n\n",
            n1, n2);

    fclose (fp1);
    fclose (fp2);

    return 0;
}

/*
 * Print each line of fp1 that does not occur anywhere in fp2.
 *
 * fp2 is read first and up to MAXL of its lines are stored with the newline
 * removed; lines beyond MAXL are not read. Then up to MAXL lines of fp1 are
 * read, and each one that equals none of the stored lines is printed to
 * stdout, indented by two spaces.
 *
 * fp2, fp1  streams open for reading (note the order: file2 first)
 * n2, n1    out: number of lines read from fp2 and fp1 respectively
 *
 * A final line without a trailing '\n' is accepted. A line that fills the
 * MAXC-byte buffer without a newline is treated as too long: a message is
 * written to stderr and the program exits with EXIT_FAILURE.
 */
void file1infile2 (FILE *fp2, FILE *fp1, size_t *n2, size_t *n1)
{
    char buf[MAXC] = "";
    char f2buf[MAXL][MAXC] = { "" };
    size_t i;
    *n1 = *n2 = 0;

    while (*n2 < MAXL && fgets (buf, MAXC, fp2)) {
        size_t len = strcspn (buf, "\n");
        /* No newline is only an error when the buffer is full (line was cut). */
        if (buf[len] != '\n' && len == (size_t) MAXC - 1) {
            fprintf (stderr, "error: line exceeds MAXC chars.\n");
            exit (EXIT_FAILURE);
        }
        buf[len] = 0;
        strcpy (f2buf[(*n2)++], buf);
    }

    while (*n1 < MAXL && fgets (buf, MAXC, fp1)) {
        size_t len = strcspn (buf, "\n");
        if (buf[len] != '\n' && len == (size_t) MAXC - 1) {
            fprintf (stderr, "error: line exceeds MAXC chars.\n");
            exit (EXIT_FAILURE);
        }
        buf[len] = 0;
        (*n1)++;

        /* Advance past every stored line that differs; i == *n2 means no match. */
        for (i = 0; i < *n2; i++)
            if (strcmp (f2buf[i], buf) == 0)
                break;

        if (i == *n2)
            printf ("  %s\n", buf);
    }
}

Look over the code and let me know if you have any questions.

Example Use/Output

$ ./bin/f1inf2 dat/f1 dat/f2

unique words in file1, not in file 2.

  john
  the new
  guy

analyzed 4 lines in file1 against 6 lines in file2.

Upvotes: 1

chux
chux

Reputation: 153498

Problems with OP's code

  1. Imprecise definition of line.

  2. Excessive recalculation

  3. Fuzzy determination of the number of lines in a file.


  1. Unlike string, which has a precise definition in C, reading a line is not so well defined. The primary specificity issue: does a line contain the trailing '\n'. If the first answer is Yes, then does the last text in a file after a '\n' constitute a line? (Excessively long lines are another issue, but let us not deal with that today.)

Thus possibly some lines end with '\n' and others do not, fooling strcmp("dog", "dog\n").

The easiest solution is to read a line until either 1) a '\n' is encountered, 2) EOF occurs or 3) line buffer is full. Then after getting a line, lop off the potential trailing '\n'.

From then on, every line the code works with has no trailing '\n'.

fgets(buffer_line_template_file, 100, template_file);
buffer_line_template_file[strcspn(buffer_line_template_file, "\n")] = '\0';
  1. OP's loop is incredibly wasteful. Consider a file with 1000 lines. The code will call countlines() 1000 times (and each countlines() call reads 1000 lines) when a single countlines() call would suffice.

    // for (int j = 0; j < countlines(argv[2]); j++)
    int j_limit = countlines(argv[2]);
    for (int j = 0; j < j_limit; j++)
    
  2. There really is no need to count the lines anyway; just continue until EOF (fgets() returns NULL). So there is no need to fix the fuzzy definition. (The fuzziness concerns the same issues as #1.)

    int counter = 0;
    for (fgets(buffer_line_template_file, 100, template_file)) {
      buffer_line_template_file[strcspn(buffer_line_template_file, "\n")] = '\0';
    
      rewind(data_file);
      while ((fgets(buffer_line_data_file, 100, data_file)) {
        buffer_line_data_file[strcspn(buffer_line_data_file, "\n")] = '\0';
    
        if (strcmp(buffer_line_template_file, buffer_line_data_file) != 0) {
          counter++;
          printf("%d", counter);
        }
      }
    }
    

Other simplifications possible - for another day.


FWIW, following counts lines of text allowing the last line in the file to optionally end with a '\n'.

    /*
     * Count the lines of text in istream, reading from the start of the stream.
     *
     * A line is counted at its first character, i.e. whenever a character
     * follows a '\n' or begins the file, so the last line is counted whether
     * or not it ends with '\n'. An empty stream yields 0.
     *
     * The stream is rewound before counting and is left at end-of-file.
     */
    unsigned long long FileLineCount(FILE *istream) {
      unsigned long long LineCount = 0;
      rewind(istream);
      /* Pretend a newline precedes the file so the first character starts line 1. */
      int previous = '\n';
      int ch;

      while ((ch = fgetc(istream)) != EOF) {
        if (previous == '\n') LineCount++;
        previous = ch;
      }
      return LineCount;
    }

Note that this function may get a different result than fgets() calls. Consider a file of one line of 150 characters. fgets(..., 100,...) will report 2 lines. FileLineCount() reports 1.

[Edit] Updated code to conform to OP functionality.

    /* Scan the data file for the current template line; print it only if absent. */
    int found = 0;
    while (!found && fgets(buffer_line_data_file, 100, data_file))
    {
        /* Cut the line at the first '\n', if there is one. */
        buffer_line_data_file[strcspn(buffer_line_data_file, "\n")] = '\0';

        found = (strcmp(buffer_line_template_file, buffer_line_data_file) == 0);
    }
    if (!found)
        printf("%s\n", buffer_line_template_file);

Upvotes: 2

Niklas Rosencrantz
Niklas Rosencrantz

Reputation: 26647

This program prints the diff of two files file1.txt and file2.txt.

#define _POSIX_C_SOURCE 200809L
#include <memory.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Compare file1.txt and file2.txt line by line.
 *
 * Lines are compared with their trailing newline removed. For every line
 * number at which the files differ (including lines present in only one
 * file) the line number is printed, followed by the file1 line prefixed
 * with "< " and the file2 line prefixed with "> ". A final message states
 * whether the files are identical.
 *
 * Exits with status 1 if either file cannot be opened; returns 0 otherwise.
 */
int main() {
    const char *fname1 = "file1.txt";
    const char *fname2 = "file2.txt";

    FILE *fp1 = fopen(fname1, "r");
    if (fp1 == NULL) {
        printf("Cannot open %s for reading ", fname1);
        exit(1);
    }

    FILE *fp2 = fopen(fname2, "r");
    if (fp2 == NULL) {
        printf("Cannot open %s for reading ", fname2);
        fclose(fp1);
        exit(1);
    }

    /* getline() allocates and grows these buffers; they are freed below. */
    char *line1 = NULL;
    char *line2 = NULL;
    size_t cap1 = 0;
    size_t cap2 = 0;

    size_t lineno = 0;
    int differ = 0;

    for (;;) {
        /* Read from both files every round so a longer file is still reported. */
        int have1 = getline(&line1, &cap1, fp1) != -1;
        int have2 = getline(&line2, &cap2, fp2) != -1;

        if (!have1 && !have2)
            break;
        lineno++;

        if (have1)
            line1[strcspn(line1, "\n")] = '\0';
        if (have2)
            line2[strcspn(line2, "\n")] = '\0';

        if (!have1 || !have2 || strcmp(line1, line2) != 0) {
            differ = 1;
            printf("Retrieved diff on line %zu :\n", lineno);
            if (have1)
                printf("< %s\n", line1);
            if (have2)
                printf("> %s\n", line2);
        }
    }

    if (!differ)
        printf("Files are identical \n");
    else
        printf("Files are Not identical \n");

    free(line1);
    free(line2);
    fclose(fp1);
    fclose(fp2);

    return (0);
}

Upvotes: 1

Related Questions