Reputation: 372
I try to split one string to 3-gram strings. But turns out that the resulting substrings were always messy. The length and char ** input... are needed, since I will use them as args later for python calling the funxtion.
This is the function I wrote.
struct strArrIntArr getSearchArr(char* input, int length) {
struct strArrIntArr nameIndArr;
// flag of same bit
int same;
// flag/index of identical strings
int flag = 0;
// how many identical strings
int num = 0;
// array of split strings
char** nameArr = (char **)malloc(sizeof(char *) * (length - 2));
if ( nameArr == NULL ) exit(0);
// numbers of every split string
int* valueArr = (int* )malloc(sizeof(int) * (length-2));
if ( valueArr == NULL ) exit(0);
// loop length of search string -2 times (3-gram)
for(int i = 0; i<length-2; i++){
if(flag==0){
nameArr[i - num] = (char *)malloc(sizeof(char) * 3);
if ( nameArr[i - num] == NULL ) exit(0);
printf("----i------------%d------\n", i);
printf("----i-num--------%d------\n", i-num);
}
flag = 0;
// compare splitting string with existing split strings,
// if a string exists, it would not be stored
for(int k=0; k<i-num; k++){
same = 0;
for(int j=0; j<3; j++){
if(input[i + j] == nameArr[k][j]){
same ++;
}
}
// identical strings found, if all the three bits are the same
if(same == 3){
flag = k;
num++;
break;
}
}
// if the current split string doesn't exist yet
// put current split string to array
if(flag == 0){
for(int j=0; j<3; j++){
nameArr[i-num][j] = input[i + j];
valueArr[i-num] = 1;
}
}else{
valueArr[flag]++;
}
printf("-----string----%s\n", nameArr[i-num]);
}
// number of N-gram strings
nameIndArr.length = length- 2- num;
// array of N-gram strings
nameIndArr.charArr = nameArr;
nameIndArr.intArr = valueArr;
return nameIndArr;
}
To call the function:
int main(int argc, const char * argv[]) {
int length = 30;
char* input = (char *)malloc(sizeof(char) * length);
input = "googleapis.com.wncln.wncln.org";
// split the search string into N-gram strings
// and count the numbers of every split string
struct strArrIntArr nameIndArr = getSearchArr(input, length);
}
Below is the result. The strings from 17 are messy.
----i------------0------
----i-num--------0------
-----string----goo
----i------------1------
----i-num--------1------
-----string----oog
----i------------2------
----i-num--------2------
-----string----ogl
----i------------3------
----i-num--------3------
-----string----gle
----i------------4------
----i-num--------4------
-----string----lea
----i------------5------
----i-num--------5------
-----string----eap
----i------------6------
----i-num--------6------
-----string----api
----i------------7------
----i-num--------7------
-----string----pis
----i------------8------
----i-num--------8------
-----string----is.
----i------------9------
----i-num--------9------
-----string----s.c
----i------------10------
----i-num--------10------
-----string----.co
----i------------11------
----i-num--------11------
-----string----com
----i------------12------
----i-num--------12------
-----string----om.
----i------------13------
----i-num--------13------
-----string----m.w
----i------------14------
----i-num--------14------
-----string----.wn
----i------------15------
----i-num--------15------
-----string----wnc
---i------------16------
----i-num--------16------
-----string----ncl
----i------------17------
----i-num--------17------
-----string----clnsole
----i------------18------
----i-num--------18------
-----string----ln.=C:
----i------------19------
----i-num--------19------
-----string----n.wgram 馻绚s
----i------------20------
----i-num--------20------
-----string----n.wgram 馻绚s
-----string----n.wgram 馻绚s
-----string----n.wgram 馻绚s
-----string----n.wgram 馻绚s
-----string----n.wgram 馻绚s
-----string----n.oiles(騛窑=
----i------------26------
----i-num--------21------
-----string----.orSModu鯽蓼t
----i------------27------
----i-num--------22------
-----string----org
under win10, codeblocks 17.12, gcc 8.1.0
Upvotes: 3
Views: 85
Reputation: 9855
This answer tries to fix the existing code instead of proposing alternative/better solutions.
After fixing the output
printf("-----string----%s\n", nameArr[i-num]);
in the question, there is still another important problem.
You want to store 3 characters in nameArr[i-num]
and allocate space for 3 characters. Later you print is as a string in the code shown above. This requires a trailing '\0'
after the 3 characters, so you have to allocate memory for 4 characters and either append a '\0'
or initialize the allocated memory with 0. Using calloc
instead of malloc
would automatically initialize the memory to 0.
Here is a modified version of the source code
I also changed the initialization of the string value and its length in main()
to avoid the memory leak.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
struct strArrIntArr {
int length;
char **charArr;
int *intArr;
};
struct strArrIntArr getSearchArr(char* input, int length) {
struct strArrIntArr nameIndArr;
// flag of same bit
int same;
// flag/index of identical strings
int flag = 0;
// how many identical strings
int num = 0;
// array of split strings
char** nameArr = (char **)malloc(sizeof(char *) * (length - 2));
if ( nameArr == NULL ) exit(0);
// numbers of every split string
int* valueArr = (int* )malloc(sizeof(int) * (length-2));
if ( valueArr == NULL ) exit(0);
// loop length of search string -2 times (3-gram)
for(int i = 0; i<length-2; i++){
if(flag==0){
nameArr[i - num] = (char *)malloc(sizeof(char) * 4);
if ( nameArr[i - num] == NULL ) exit(0);
printf("----i------------%d------\n", i);
printf("----i-num--------%d------\n", i-num);
}
flag = 0;
// compare splitting string with existing split strings,
// if a string exists, it would not be stored
for(int k=0; k<i-num; k++){
same = 0;
for(int j=0; j<3; j++){
if(input[i + j] == nameArr[k][j]){
same ++;
}
}
// identical strings found, if all the three bits are the same
if(same == 3){
flag = 1;
num++;
break;
}
}
// if the current split string doesn't exist yet
// put current split string to array
if(flag == 0){
for(int j=0; j<3; j++){
nameArr[i-num][j] = input[i + j];
valueArr[i-num] = 1;
}
nameArr[i-num][3] = '\0';
}else{
valueArr[flag]++;
}
printf("-----string----%s\n", nameArr[i-num]);
}
// number of N-gram strings
nameIndArr.length = length- 2- num;
// array of N-gram strings
nameIndArr.charArr = nameArr;
nameIndArr.intArr = valueArr;
return nameIndArr;
}
int main(int argc, const char * argv[]) {
int length;
char* input = strdup("googleapis.com.wncln.wncln.org");
length = strlen(input);
// split the search string into N-gram strings
// and count the numbers of every split string
struct strArrIntArr nameIndArr = getSearchArr(input, length);
}
This other answer contains more improvements which I personally would prefer over the modified original solution.
Upvotes: 0
Reputation: 29126
You are making life complicated for you in several places:
num
the count of duplicates, make it the count of unique trigraphs.Here's an alternative implementation:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
struct tri {
char str[4]; // trigraph: 3 chars and NUL
int count; // count of occurrences
};
struct stat {
struct tri *tri; // list of trigraphs with counts
int size; // number of trigraphs
};
/*
* Find string 'key' in list of trigraphs. Return the index
* or in the array or -1 if it isn't found.
*/
int find_trigraph(const struct tri *tri, int n, const char *key)
{
for (int i = 0; i < n; i++) {
int j = 0;
while (j < 3 && tri[i].str[j] == key[j]) j++;
if (j == 3) return i;
}
return -1;
}
/*
* Create an array of trigraphs from the input string.
*/
struct stat getSearchArr(char* input, int length)
{
int num = 0;
struct tri *tri = malloc(sizeof(*tri) * (length - 2));
for(int i = 0; i < length - 2; i++) {
int index = find_trigraph(tri, num, input + i);
if (index < 0) {
snprintf(tri[num].str, 4, "%.3s", input + i); // see [1]
tri[num].count = 1;
num++;
} else {
tri[index].count++;
}
}
for(int i = 0; i < num; i++) {
printf("#%d %s: %d\n", i, tri[i].str, tri[i].count);
}
struct stat stat = { tri, num };
return stat;
}
/*
* Driver code
*/
int main(void)
{
char *input = "googleapis.com.wncln.wncln.org";
int length = strlen(input);
struct stat stat = getSearchArr(input, length);
// ... do stuff with stat ...
free(stat.tri);
return 0;
}
Footnote 1: I find that snprintf(str, n, "%.*s", len, str + offset)
is useful for copying substrings: The result will not overflow the buffer and it will be null-terminated. There really ought to be a stanard function for this, but strcpy
may overflow and strncpy
may leave the buffer unterminated.
Upvotes: 2