heisthere
heisthere

Reputation: 372

Try to split string but got messy substrings

I try to split one string to 3-gram strings. But turns out that the resulting substrings were always messy. The length and char ** input... are needed, since I will use them as args later for python calling the funxtion.

This is the function I wrote.

struct strArrIntArr getSearchArr(char* input, int length) {

    struct strArrIntArr nameIndArr;
    // flag of same bit
    int same;
    // flag/index of identical strings
    int flag = 0;
    // how many identical strings
    int num = 0;
    // array of split strings
    char** nameArr = (char **)malloc(sizeof(char *) * (length - 2));
    if ( nameArr == NULL ) exit(0);
    // numbers of every split string
    int* valueArr = (int* )malloc(sizeof(int) * (length-2));
    if ( valueArr == NULL ) exit(0);

    // loop length of search string -2 times (3-gram)
    for(int i = 0; i<length-2; i++){
        if(flag==0){
            nameArr[i - num] = (char *)malloc(sizeof(char) * 3);
            if ( nameArr[i - num] == NULL ) exit(0);
            printf("----i------------%d------\n", i);
            printf("----i-num--------%d------\n", i-num);
        }
        flag = 0;
        // compare splitting string with existing split strings,
        // if a string exists, it would not be stored
        for(int k=0; k<i-num; k++){
            same = 0;
            for(int j=0; j<3; j++){
                if(input[i + j] == nameArr[k][j]){
                    same ++;
                }
            }
            // identical strings found, if all the three bits are the same
            if(same == 3){
                flag = k;
                num++;
                break;
            }
        }
        // if the current split string doesn't exist yet
        // put current split string to array
        if(flag == 0){
            for(int j=0; j<3; j++){
                nameArr[i-num][j] = input[i + j];
                valueArr[i-num] = 1;
            }
        }else{
            valueArr[flag]++;
        }
        printf("-----string----%s\n", nameArr[i-num]);
    }

    // number of N-gram strings
    nameIndArr.length = length- 2- num;
    // array of N-gram strings
    nameIndArr.charArr = nameArr;
    nameIndArr.intArr = valueArr;
    return nameIndArr;
}

To call the function:

int main(int argc, const char * argv[]) {
    int length = 30;
    char* input = (char *)malloc(sizeof(char) * length);
    input = "googleapis.com.wncln.wncln.org";

    // split the search string into N-gram strings
    // and count the numbers of every split string
    struct strArrIntArr nameIndArr = getSearchArr(input, length);
}

Below is the result. The strings from 17 are messy.

----i------------0------
----i-num--------0------
-----string----goo

----i------------1------
----i-num--------1------
-----string----oog

----i------------2------
----i-num--------2------
-----string----ogl

----i------------3------
----i-num--------3------
-----string----gle

----i------------4------
----i-num--------4------
-----string----lea

----i------------5------
----i-num--------5------
-----string----eap

----i------------6------
----i-num--------6------
-----string----api

----i------------7------
----i-num--------7------
-----string----pis

----i------------8------
----i-num--------8------
-----string----is.

----i------------9------
----i-num--------9------
-----string----s.c

----i------------10------
----i-num--------10------
-----string----.co

----i------------11------
----i-num--------11------
-----string----com

----i------------12------
----i-num--------12------
-----string----om.

----i------------13------
----i-num--------13------
-----string----m.w

----i------------14------
----i-num--------14------
-----string----.wn

----i------------15------
----i-num--------15------
-----string----wnc

---i------------16------
----i-num--------16------
-----string----ncl

----i------------17------
----i-num--------17------
-----string----clnsole

----i------------18------
----i-num--------18------
-----string----ln.=C:

----i------------19------
----i-num--------19------
-----string----n.wgram 馻绚s

----i------------20------
----i-num--------20------
-----string----n.wgram 馻绚s

-----string----n.wgram 馻绚s

-----string----n.wgram 馻绚s

-----string----n.wgram 馻绚s

-----string----n.wgram 馻绚s

-----string----n.oiles(騛窑=

----i------------26------
----i-num--------21------
-----string----.orSModu鯽蓼t

----i------------27------
----i-num--------22------
-----string----org

under win10, codeblocks 17.12, gcc 8.1.0

Upvotes: 3

Views: 85

Answers (2)

Bodo
Bodo

Reputation: 9855

This answer tries to fix the existing code instead of proposing alternative/better solutions.

After fixing the output

printf("-----string----%s\n", nameArr[i-num]);

in the question, there is still another important problem.

You want to store 3 characters in nameArr[i-num] and allocate space for 3 characters. Later you print is as a string in the code shown above. This requires a trailing '\0' after the 3 characters, so you have to allocate memory for 4 characters and either append a '\0' or initialize the allocated memory with 0. Using calloc instead of malloc would automatically initialize the memory to 0.

Here is a modified version of the source code

I also changed the initialization of the string value and its length in main() to avoid the memory leak.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>


struct strArrIntArr {
    int length;
    char **charArr;
    int *intArr;
};

struct strArrIntArr getSearchArr(char* input, int length) {

    struct strArrIntArr nameIndArr;
    // flag of same bit
    int same;
    // flag/index of identical strings
    int flag = 0;
    // how many identical strings
    int num = 0;
    // array of split strings
    char** nameArr = (char **)malloc(sizeof(char *) * (length - 2));
    if ( nameArr == NULL ) exit(0);
    // numbers of every split string
    int* valueArr = (int* )malloc(sizeof(int) * (length-2));
    if ( valueArr == NULL ) exit(0);

    // loop length of search string -2 times (3-gram)
    for(int i = 0; i<length-2; i++){
        if(flag==0){
            nameArr[i - num] = (char *)malloc(sizeof(char) * 4);
            if ( nameArr[i - num] == NULL ) exit(0);
            printf("----i------------%d------\n", i);
            printf("----i-num--------%d------\n", i-num);
        }
        flag = 0;
        // compare splitting string with existing split strings,
        // if a string exists, it would not be stored
        for(int k=0; k<i-num; k++){
            same = 0;
            for(int j=0; j<3; j++){
                if(input[i + j] == nameArr[k][j]){
                    same ++;
                }
            }
            // identical strings found, if all the three bits are the same
            if(same == 3){
                flag = 1;
                num++;
                break;
            }
        }
        // if the current split string doesn't exist yet
        // put current split string to array
        if(flag == 0){
            for(int j=0; j<3; j++){
                nameArr[i-num][j] = input[i + j];
                valueArr[i-num] = 1;
            }
            nameArr[i-num][3] = '\0';
        }else{
            valueArr[flag]++;
        }
        printf("-----string----%s\n", nameArr[i-num]);
    }

    // number of N-gram strings
    nameIndArr.length = length- 2- num;
    // array of N-gram strings
    nameIndArr.charArr = nameArr;
    nameIndArr.intArr = valueArr;
    return nameIndArr;
}


int main(int argc, const char * argv[]) {
    int length;
    char* input = strdup("googleapis.com.wncln.wncln.org");
    length = strlen(input);

    // split the search string into N-gram strings
    // and count the numbers of every split string
    struct strArrIntArr nameIndArr = getSearchArr(input, length);
}

This other answer contains more improvements which I personally would prefer over the modified original solution.

Upvotes: 0

M Oehm
M Oehm

Reputation: 29126

You are making life complicated for you in several places:

  • Don't count backwards: Instead of making num the count of duplicates, make it the count of unique trigraphs.
  • Scope variable definitions in functions as closely as possible. You have several uninitialized variables. You have declared them at the start of the function, but you need them only in local blocks.
  • Initialize as soon as you allocate. In your code, you use a flag to determine whather to create a new string. The code to allocate he string and to initialize it are in different blocks. Those blocks have the same flag as condition, but the flag is updated in between. This could lead to asynchronities, even to bugs when you try to initialize memory that wasn't allocated.
  • It's probably better to keep the strings and their counts together in a struct. If anything, this will help you with sorting later. This also offers some simplification: Instead of allocating chunks of 3 bytes, keep a char array of four bytes in the struct, so that all entries can be properly null-terminated. Those don't need to be allocated separately.

Here's an alternative implementation:

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

struct tri {
    char str[4];        // trigraph: 3 chars and NUL
    int count;          // count of occurrences
};

struct stat {
    struct tri *tri;    // list of trigraphs with counts
    int size;           // number of trigraphs
};

/*
 *      Find string 'key' in list of trigraphs. Return the index
 *      or in the array or -1 if it isn't found.
 */
int find_trigraph(const struct tri *tri, int n, const char *key)
{
    for (int i = 0; i < n; i++) {
        int j = 0;

        while (j < 3 && tri[i].str[j] == key[j]) j++;
        if (j == 3) return i;
    }

    return -1;
}

/*
 *      Create an array of trigraphs from the input string.
 */
struct stat getSearchArr(char* input, int length)
{
    int num = 0;

    struct tri *tri = malloc(sizeof(*tri) * (length - 2));

    for(int i = 0; i < length - 2; i++) {
        int index = find_trigraph(tri, num, input + i);

        if (index < 0) {
            snprintf(tri[num].str, 4, "%.3s", input + i);       // see [1]
            tri[num].count = 1;             
            num++;
        } else {
            tri[index].count++;
        }
    }


    for(int i = 0; i < num; i++) {
        printf("#%d %s: %d\n", i, tri[i].str, tri[i].count);
    }

    struct stat stat = { tri, num };

    return stat;
}

/*
 *      Driver code
 */
int main(void)
{
    char *input = "googleapis.com.wncln.wncln.org";
    int length = strlen(input);

    struct stat stat = getSearchArr(input, length);

    // ... do stuff with stat ...

    free(stat.tri);

    return 0;
}

Footnote 1: I find that snprintf(str, n, "%.*s", len, str + offset) is useful for copying substrings: The result will not overflow the buffer and it will be null-terminated. There really ought to be a stanard function for this, but strcpy may overflow and strncpy may leave the buffer unterminated.

Upvotes: 2

Related Questions