How do I count the number of words with the same length in an array of strings in C?

Question

I'm opening and reading a dictionary file and counting how many words are in the file. Then I'm storing each word individually in an array of strings. After that, I sort the words by length and then in alphabetical order, using the function qsort(). Right now, I'm trying to walk through the table and count how many words have the same length, but I'm having some difficulty deciding how I should proceed next. The code I have written so far is this:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAX_STR 100

/*
 * qsort() comparison callback for an array of C strings.
 *
 * Sorts the words by length and then in alphabetical order, length being
 * priority number one.
 *
 * a, b: pointers to two elements of the array being sorted, i.e. each one
 *       points to a pointer to a NUL-terminated string.
 *
 * Returns a negative value if *a sorts before *b, a positive value if it
 * sorts after, and 0 if both strings are identical.
 */
int compare(const void *a, const void *b){

    /* Keep the elements const: the callback must not modify the array. */
    const char *const *str_a = (const char *const *)a;
    const char *const *str_b = (const char *const *)b;

    /* strlen() returns size_t; storing it in an int could truncate. */
    size_t len1 = strlen(*str_a);
    size_t len2 = strlen(*str_b);
    if (len1 < len2) return -1;
    if (len1 > len2) return  1;

    /* Same length: fall back to byte-wise alphabetical order. */
    return strcmp(*str_a, *str_b);
}

/*
 * Loads every whitespace-separated word of "words.dict" into a dynamically
 * allocated table of strings and sorts the table with compare().
 *
 * The file is read twice: a first pass counts the words so the table can be
 * allocated in one go, a second pass stores a copy of each word.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if the file cannot be opened
 * or memory cannot be allocated.
 */
int main (int argc, char *argv[]){

    FILE *fp = NULL;
    int i = 0, n_total_palavras = 0;   /* total number of words in the file */
    char str[MAX_STR];
    int  count = 0;                    /* number of words actually stored */
    char **Words;

    /* The file name is hard-coded, so the command line is not used. */
    (void)argc;
    (void)argv;

    fp = fopen("words.dict", "r");
    if (fp == NULL){
        perror("words.dict");
        exit (EXIT_FAILURE);
    }

    /* The field width 99 leaves room for the terminating NUL in
       str[MAX_STR] (MAX_STR is 100); a bare %s could overflow the buffer. */
    while (fscanf(fp,"%99s",str) == 1){
        n_total_palavras++;
    }

    /* Nothing to sort; also avoids a zero-sized allocation. */
    if (n_total_palavras == 0){
        fclose(fp);
        return 0;
    }

    Words = malloc((size_t)n_total_palavras * sizeof *Words);
    if (Words == NULL){
        fclose(fp);
        exit(EXIT_FAILURE);
    }

    for (i = 0; i < n_total_palavras; i++){
        Words[i] = NULL;
    }

    rewind (fp);
    /* Stop at the table size even if the file grew since the first pass. */
    while (count < n_total_palavras && fscanf(fp,"%99s",str) == 1){
        Words[count] = malloc(strlen(str) + 1);
        if (Words[count] == NULL){
            fclose(fp);
            exit(EXIT_FAILURE);
        }
        strcpy(Words[count], str);
        count++;
    }
    fclose(fp);

    /* Sort only the entries that were really filled in. */
    qsort(Words, (size_t)count, sizeof(Words[0]), compare);

    /* Debug aid: print the sorted table.
    for(i = 0; i < count; i++){
        printf("%s\n", Words[i]);
    }
    */

    for (i = 0; i < count; i++){
        free(Words[i]);
    }
    free(Words);
    return 0;
}

I'm trying to obtain something like:

4 letters words: 2018
5 letters words: 170
6 letters words: 10
(...)

Any idea on how I should look at this ?

Jonathan Leffler · Accepted Answer

Here's my code implementing what I suggested. It reads the file once, growing the word list as needed. It allocates about twice as much space as before each time more space is needed.

I've simplified the comparison function marginally — but the optimizer probably gets close to what I've written anyway.

The code is currently configured not to print the sorted word list.

/* SO 7400-7509 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_STR 100

/*
 * qsort() comparison callback: sorts the words first by length and then in
 * alphabetical order.
 *
 * a, b: pointers to two elements of the sorted array; each element is a
 *       pointer to a NUL-terminated string.
 *
 * Returns -1 or 1 when the lengths differ (shorter word first), otherwise
 * the result of strcmp() on the two words.
 */
static int compare(const void *a, const void *b)
{
    /* Cast without discarding the const qualifier of the array elements. */
    const char *str_a = *(const char *const *)a;
    const char *str_b = *(const char *const *)b;

    /* Keep lengths as size_t; narrowing to int could truncate. */
    size_t len_a = strlen(str_a);
    size_t len_b = strlen(str_b);

    if (len_a != len_b)
        return (len_a < len_b) ? -1 : 1;

    return strcmp(str_a, str_b);
}

/*
 * Reads whitespace-separated words from a dictionary file, sorts them by
 * length and then alphabetically, and prints how many words there are of
 * each length.
 *
 * Usage: program [filename]
 * The file defaults to "words.dict" when no argument is given.
 *
 * The file is read once; the word table grows on demand, roughly doubling
 * each time more space is needed.
 *
 * Returns 0 on success; exits with EXIT_FAILURE on a usage error, when the
 * file cannot be opened, or when memory cannot be allocated.
 */
int main(int argc, char *argv[])
{
    char str[MAX_STR];
    char **words = NULL;
    size_t max_words = 0;   /* allocated capacity of words[] */
    size_t num_words = 0;   /* number of entries in use */
    const char *filename = "words.dict";

    if (argc == 2)
        filename = argv[1];
    else if (argc > 2)
    {
        fprintf(stderr, "Usage: %s [filename]\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    FILE *fp = fopen(filename, "r");
    if (fp == NULL)
    {
        fprintf(stderr, "%s: failed to open file '%s' for reading\n",
                argv[0], filename);
        exit(EXIT_FAILURE);
    }

    /* The width 99 leaves room for the NUL in str[MAX_STR] (MAX_STR is 100). */
    while (fscanf(fp, "%99s", str) == 1)
    {
        if (num_words >= max_words)
        {
            size_t new_size = (max_words + 2) * 2;
            void *new_space = realloc(words, sizeof(words[0]) * new_size);
            if (new_space == NULL)
            {
                fprintf(stderr, "%s: failed to allocate %zu pointers\n",
                        argv[0], new_size);
                exit(EXIT_FAILURE);
            }
            words = new_space;
            max_words = new_size;
        }

        /* Copy the word with malloc/memcpy: portable ISO C, and the
           allocation result is checked before it is stored. */
        size_t size = strlen(str) + 1;
        char *copy = malloc(size);
        if (copy == NULL)
        {
            fprintf(stderr, "%s: failed to allocate %zu bytes\n",
                    argv[0], size);
            exit(EXIT_FAILURE);
        }
        memcpy(copy, str, size);
        words[num_words++] = copy;
    }
    fclose(fp);

    /* An empty file leaves words == NULL; there is nothing to sort or count. */
    if (num_words == 0)
    {
        free(words);
        return 0;
    }

    qsort(words, num_words, sizeof(words[0]), compare);

    /* Debug aid: print the sorted word list.
    for (size_t i = 0; i < num_words; i++)
    {
        printf("%zu: %s\n", i+1, words[i]);
    }
    */

    /* The list is sorted by length, so words of equal length are adjacent:
       count each run and report it when the length changes. */
    size_t count = 0;
    size_t currlen = strlen(words[0]);
    for (size_t i = 0; i < num_words; i++)
    {
        size_t length = strlen(words[i]);
        if (length == currlen)
            count++;
        else
        {
            printf("%zu-letter words: %zu\n", currlen, count);
            currlen = length;
            count = 1;
        }
    }
    /* Report the final run, which the loop never flushes. */
    printf("%zu-letter words: %zu\n", currlen, count);

    for (size_t i = 0; i < num_words; i++)
        free(words[i]);
    free(words);

    return 0;
}

Consider the data file words.dict:

alpha
beta
gamma
delta
epsilon
Hawaii
California
Colorado
Alaska
Alabama
Arizona

It produces the output:

4-letter words: 1
5-letter words: 3
6-letter words: 2
7-letter words: 3
8-letter words: 1
10-letter words: 1

Given a variation on the Linux dictionary (it's been mangled so the words are all mono-case, all punctuation is removed, and duplicates have been removed), the output was:

$ timecmd -m -- cw31 ~/src/spelling-bee/sb-wordlist
2022-10-09 13:56:34.935 [PID 94595] cw31 /Users/jonathanleffler/src/spelling-bee/sb-wordlist
1-letter words: 26
2-letter words: 566
3-letter words: 4343
4-letter words: 10359
5-letter words: 21884
6-letter words: 38179
7-letter words: 50447
8-letter words: 58182
9-letter words: 57289
10-letter words: 48591
11-letter words: 39357
12-letter words: 30260
13-letter words: 21642
14-letter words: 14585
15-letter words: 9078
16-letter words: 5325
17-letter words: 3046
18-letter words: 1505
19-letter words: 774
20-letter words: 363
21-letter words: 170
22-letter words: 74
23-letter words: 31
24-letter words: 12
25-letter words: 8
27-letter words: 3
28-letter words: 2
29-letter words: 2
31-letter words: 1
45-letter words: 1
2022-10-09 13:56:35.130 [PID 94595; status 0x0000]  -  0.195s
$ wc -l ~/src/spelling-bee/sb-wordlist
  416105 /Users/jonathanleffler/src/spelling-bee/sb-wordlist
$

The 45-letter word for the curious is "pneumonoultramicroscopicsilicovolcanoconiosis".

How do I count the number of words with the same length in an array of strings in C?

Answers (2)

Related Questions