Reputation: 61
I need to write C code that reads a text file and outputs the number of words, the number of distinct words, and the most frequent word.
I have already written the code that outputs the number of words, but I have no idea how to find the number of distinct words or the most frequent word. I know I'm supposed to use strcmp, but I'm not sure how to apply it. Any help would be greatly appreciated. Here's what I have so far.
#include <stdio.h>

int main(int argc, char *argv[])
{
    char word[25000][50];
    int wordCount = 0;
    int distinctCount = 0;
    FILE *fp;

    //reads file!
    fp = fopen("COEN12_LAB1.txt", "r");
    if (fp == NULL)
    {
        printf("File Missing!\n");
        return 0;
    }
    //counts words in file!
    while (fscanf(fp, "%49s", word[wordCount]) == 1)
        wordCount++;
    printf("Total number of words: %d\n", wordCount);
    fclose(fp);
}
Upvotes: 2
Views: 2498
Reputation: 32522
You could use a simple database to compute the various word counts from the input text. For simplicity I'd suggest SQLite. Below I have added some example code (I left out the error handling for the sake of brevity).
For reading words I read a single line at a time into a buffer using fgets. This works nicely as long as you can guarantee that the buffer is always large enough to hold the longest line of the input file. Otherwise words are split up at the end of the buffer, which needs to be handled somehow.
For parsing the text I used strtok. During the implementation I learned that it's quite hard to get the word delimiters right. Besides this, spelling differences (e.g., capitalization) and inflections of otherwise equal words are completely ignored by this approach and could thus skew the result.
Once the data is in the database, the query language is very well suited to formulating queries for the maximum (distinct) word count or for word frequencies. This flexible approach has an advantage when you want to compute multiple statistics from the input text, as you don't have to implement every special case in C. For testing, I copied a part of the Wikipedia article on SQLite into the file words.txt.
Here's the example:
#include <sqlite3.h>
#include <stdio.h>
#include <string.h>

#define DELIM " \r\n\t,.-;:_#+*\\=)(/&%$§\"“”!1234567890}][{'"
#define BUFSIZE 4096
#define SQLSIZE 256

int print_row(void *p, int ncols, char **values, char **names) {
    for (int i = 0; i < ncols; i++)
        printf("| %15s : %15s %s", names[i], values[i], i < ncols - 1 ? "" : "|\n");
    return 0;
}

int main(int argc, char *argv[]) {
    /* open infile */
    FILE *infile = fopen("words.txt", "r");

    /* initialize database */
    sqlite3 *db_handle = 0;
    sqlite3_open(":memory:", &db_handle);
    sqlite3_exec(db_handle, "CREATE TABLE word (word TEXT);", 0, 0, 0);

    /* parse file, populate db */
    char buf[BUFSIZE], sql[SQLSIZE], *word;
    while (fgets(buf, BUFSIZE, infile))
        for (word = strtok(buf, DELIM); word != 0; word = strtok(0, DELIM)) {
            snprintf(sql, SQLSIZE, "INSERT INTO word VALUES ('%s');", word);
            sqlite3_exec(db_handle, sql, 0, 0, 0);
        }

    /* count of words */
    sqlite3_exec(db_handle,
        "SELECT COUNT(word) AS total_words FROM word;",
        print_row, 0, 0);

    /* count of distinct words */
    sqlite3_exec(db_handle,
        "SELECT COUNT(DISTINCT word) AS distinct_words FROM word;",
        print_row, 0, 0);

    /* top five most frequent words */
    sqlite3_exec(db_handle,
        "SELECT word, COUNT(*) AS count FROM word "
        "GROUP BY word ORDER BY count DESC LIMIT 5;",
        print_row, 0, 0);

    sqlite3_close(db_handle);
}
Here's my output:
$ gcc test.c -std=c99 -lsqlite3 && ./a.out
| total_words : 561 |
| distinct_words : 314 |
| word : SQLite | count : 17 |
| word : is | count : 16 |
| word : the | count : 15 |
| word : a | count : 14 |
| word : to | count : 12 |
Upvotes: 0
Reputation: 23218
[EDIT]
1. replaced malloc with calloc (initializes memory to 0)
2. replaced the second argument to qsort
3. the program now works with a wider range of files (more words, more delimiters)

This isn't pretty and may need some minor debugging, but it will get you started on the count, the number of distinct words, and the most frequent word:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#define FILENAME "c:\\dev\\play\\test3.txt" //put your own path here
#define DELIM "- .,:;//_*&\n"
int longestWord(char *file, int *cnt);
void allocMemory(int numStrings, int max);
void freeMemory(int numStrings);
static int sortstring( const void *str1, const void *str2 );
char **strings;
int main()
{
    int wc, longest, cnt, distinct, i, mostFreq, mostFreqKeep = 0;
    char line[260];
    char *buf = 0;
    FILE *fp;

    longest = longestWord(FILENAME, &wc);
    char wordKeep[longest];
    allocMemory(wc, longest);

    //read file into string arrays
    fp = fopen(FILENAME, "r");
    cnt = 0;
    while (fgets(line, 260, fp))
    {
        buf = strtok(line, DELIM);
        while (buf)
        {
            if ((strlen(buf) > 0) && (buf[0] != '\t') && (buf[0] != '\n') && (buf[0] != '\0') && (buf[0] > 0))
            {
                strcpy(strings[cnt], buf);
                cnt++; //use as accurate count of words.
            }
            buf = strtok(NULL, DELIM);
        }
    }
    fclose(fp);

    //now get most frequent word
    //sort
    qsort(strings, cnt, sizeof(char *), sortstring);

    distinct = 1;
    mostFreq = 1; //every word will occur once
    wordKeep[0] = 0;
    for (i = 0; i < cnt - 1; i++)
    {
        //depends on a successful sort (alphabetization)
        if (strlen(strings[i]) > 0)
        {
            if (strcmp(strings[i], strings[i + 1]) == 0)
            {
                mostFreq++;
                if (mostFreq > mostFreqKeep)
                {
                    strcpy(wordKeep, strings[i]);
                    mostFreqKeep = mostFreq;
                }
            }
            else
            {
                mostFreq = 1;
                distinct++;
            }
        }
    }
    printf("number of words: %d\nNumber of distinct words: %d\nmost frequent word: %s - %d\n",
           cnt, distinct, wordKeep, mostFreqKeep);

    freeMemory(cnt);
    getchar();
    return 0;
}
int longestWord(char *file, int *nWords)
{
    FILE *fp;
    int cnt = 0, longest = 0, numWords = 0;
    int c; //int, not char, so the EOF comparison is reliable

    fp = fopen(file, "r");
    while ((c = fgetc(fp)) != EOF)
    {
        if (isalpha(c)) cnt++;
        else if (ispunct(c) || isspace(c))
        {
            (cnt > longest) ? (longest = cnt, cnt = 0) : (cnt = 0);
            numWords++;
        }
    }
    *nWords = numWords;
    fclose(fp);
    return longest + 1;
}
void allocMemory(int numStrings, int max)
{
    int i;
    strings = calloc(numStrings + 1, sizeof(char *));
    for (i = 0; i < numStrings; i++)
    {
        strings[i] = calloc(max + 1, sizeof(char));
    }
}
void freeMemory(int numStrings)
{
    int i;
    for (i = 0; i < numStrings; i++)
        if (strings[i]) free(strings[i]);
    free(strings);
}

static int sortstring(const void *str1, const void *str2)
{
    const char *rec1 = *(const char **)str1;
    const char *rec2 = *(const char **)str2;
    return strcmp(rec1, rec2);
}
Upvotes: 0
Reputation: 3891
I wrote a program for you; see the source here: http://olegh.cc.st/src/words.c.txt Of course, it does not handle special situations, like many words on a single line, a distinct word count > 16,000, etc. But it gives you the basic code:
Run sample:
$ cat aaa.txt
aaa
bbb
ccc
aaa
xxx
aaa
cc
$ cc words.c ; ./a.out aaa.txt
1 xxx
1 ccc
1 bbb
1 cc
3 aaa
Upvotes: 0
Reputation: 13046
First, you probably need a structure that lets you efficiently keep track of distinct words. A hash table is one possibility (maybe the best one).
Here is an example of implementing and using hash tables in C:
You can also look at this question: Porting std::map to C?
Upvotes: 1