rohan843
rohan843

Reputation: 141

How to reduce time complexity in traversing a string?

I was solving a problem to find number of such indexes, a, b, c, d in a string s, of size n made only of lowercase letters such that:

1 <= a < b < c < d <= n

and

s[a] == s[c] and s[b] == s[d]

The code I wrote traverses the string character by character in a basic manner:

#include<stdio.h>

int main()
{
    int n, count = 0;
    char s[2002];
    scanf("%d%s", &n, s);
    for(int a = 0; a<n-3; a++)
    {
        for(int b = a + 1; b<n-2; b++)
        {
            for(int c = b + 1; c<n-1; c++)
            {
                for(int d = c + 1; d<n; d++)
                {
                    if(s[a] == s[c] && s[b] == s[d] && a>=0 && b>a && c>b && d>c && d<n)
                    {
                        count++;
                    }
                }
            }
        }
    }
    printf("%d", count);
    return 0;
}

a, b, c and d are the indices. The trouble is that if the input string is big in size, the time limit is exceeded due to the 4 nested loops. Is there any way I can improve the code to decrease the complexity?

The problem statement is available here: https://www.hackerearth.com/practice/algorithms/searching/linear-search/practice-problems/algorithm/holiday-season-ab957deb/

Upvotes: 4

Views: 809

Answers (6)

Orielno
Orielno

Reputation: 409

In the worst-case scenario, the whole string contains the same character, and in this case every indexes such that 1 <= a < b < c < d <= N will satisfy s[a] == s[c] && s[b] == s[d], hence the counter would add up to n*(n-1)*(n-2)*(n-3) / 4!, which is O(n^4). In other words, assuming the counting process is one-by-one (using counter++), there is no way to make the worst-case time complexity better than O(n^4).

Having that said, this algorithm can be improved. One possible and very important improvement, is that if s[a] != s[c], there is no point in continuing to check all possible indexes b and d. user3777427 went in this direction, and it can be further improved like this:

for(int a = 0; a < n-3; a++)
{
    for(int c = a + 2; c < n-1; c++)
    {
        if(s[a] == s[c])
        {
            for(int b = a + 1; b < c; b++)
            {
                for(int d = c + 1; d < n; d++)
                {
                    if(s[b] == s[d])
                    {
                        count++;
                    }
                }
            }
        }
    }
}

Edit:

After some more thought, I have found a way to reduce to worst-cast time complexity to O(n^3), by using a Histogram.

First, we go over the char array once and fill up the Histogram, such that index 'a' in the Histogram will contain the number of occurences of 'a', index 'b' in the Histogram will contain the number of occurences of 'b', etc.

Then, we use the Histogram to eliminate the need for the most inner loop (the d loop), like this:

int histogram1[256] = {0};
for (int i = 0; i < n; ++i)
{
    ++histogram1[(int) s[i]];
}

int histogram2[256];

for(int a = 0; a < n-3; a++)
{
    --histogram1[(int) s[a]];
    
    for (int i = 'a'; i <= 'z'; ++i)
    {
        histogram2[i] = histogram1[i];
    }

    --histogram2[(int) s[a+1]];

    for (int c = a + 2; c < n-1; c++)
    {
        --histogram2[(int) s[c]];

        for (int b = a + 1; b < c; b++)
        {
            if (s[a] == s[c])
            {
                count += histogram2[(int) s[b]];
            }
        }
    }
}

Upvotes: 1

Eric Postpischil
Eric Postpischil

Reputation: 224596

Here is an O(n) solution (counting the number of characters in the allowed character set as constant).

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>


/*  As used in this program, "substring" means a string that can be formed by
    characters from another string.  The resulting characters are not
    necessarily consecutive in the original string.  For example, "ab" is a
    substring of "xaxxxxbxx".

    This program requires the lowercase letters to have consecutive codes, as
    in ASCII.
*/


#define Max   2000      //  Maximum string length supported.
typedef short     T1;   //  A type that can hold Max.
typedef int       T2;   //  A type that can hold Max**2.
typedef long      T3;   //  A type that can hold Max**3.
typedef long long T4;   //  A type that can hold Max**4.
#define PRIT4 "lld"     //  A conversion specification that will print a T4.

#define L   ('z'-'a'+1) //  Number of characters in the set allowed.


/*  A Positions structure records all positions of a character in the string.
    N is the number of appearances, and Position[i] is the position (index into
    the string) of the i-th appearance, in ascending order.
*/
typedef struct { T1 N, Position[Max]; } Positions;


/*  Return the number of substrings "aaaa" that can be formed from "a"
    characters in the positions indicated by A.
*/
static T4 Count1(const Positions *A)
{
    T4 N = A->N;
    return N * (N-1) * (N-2) * (N-3) / (4*3*2*1);
}


/*  Return the number of substrings "abab" that can be formed from "a"
    characters in the positions indicated by A and "b" characters in the
    positions indicated by B.  A and B must be different.
*/
static T4 Count2(const Positions *A, const Positions *B)
{
    //  Exit early for trivial cases.
    if (A->N < 2 || B->N < 2)
        return 0;

    /*  Sum[i] will record the number of "ab" substrings that can be formed
        with a "b" at the position in B->Position[b] or earlier.
    */
    T2 Sum[Max];

    T3 RunningSum = 0;

    /*  Iterate b through the indices of B->Position.  While doing this, a is
        synchronized to index to a corresponding place in A->Position.
    */
    for (T1 a = 0, b = 0; b < B->N; ++b)
    {
        /*  Advance a to index into A->Position where where A->Position[i]
            first exceeds B->Position[b], or to the end if there is no such
            spot.
        */
        while (a < A->N && A->Position[a] < B->Position[b])
            ++a;

        /*  The number of substrings "ab" that can be formed using the "b" at
            position B->Position[b] is a, the number of "a" preceding it.
            Adding this to RunningSum produces the number of substrings "ab"
            that can be formed using this "b" or an earlier one.
        */
        RunningSum += a;

        //  Record that.
        Sum[b] = RunningSum;
    }

    RunningSum = 0;

    /*  Iterate a through the indices of A->Position.  While doing this, b is
        synchronized to index to a corresponding place in B->Position.
    */
    for (T1 a = 0, b = 0; a < A->N; ++a)
    {
        /*  Advance b to index into B->Position where where B->Position[i]
            first exceeds A->Position[a], or to the end if there is no such
            spot.
        */
        while (b < B->N && B->Position[b] < A->Position[a])
            ++b;

        /*  The number of substrings "abab" that can be formed using the "a"
            at A->Position[a] as the second "a" in the substring is the number
            of "ab" substrings that can be formed with a "b" before the this
            "a" multiplied by the number of "b" after this "a".

            That number of "ab" substrings is in Sum[b-1], if 0 < b.  If b is
            zero, there are no "b" before this "a", so the number is zero.

            The number of "b" after this "a" is B->N - b.
        */
        if (0 < b) RunningSum += (T3) Sum[b-1] * (B->N - b);
    }

    return RunningSum;
}


int main(void)
{
    //  Get the string length.
    size_t length;
    if (1 != scanf("%zu", &length))
    {
        fprintf(stderr, "Error, expected length in standard input.\n");
        exit(EXIT_FAILURE);
    }

    //  Skip blanks.
    int c;
    do
        c = getchar();
    while (c != EOF && isspace(c));
    ungetc(c, stdin);

    /*  Create an array of Positions, one element for each character in the
        allowed set.
    */
    Positions P[L] = {{0}};

    for (size_t i = 0; i < length; ++i)
    {
        c = getchar();
        if (!islower(c))
        {
            fprintf(stderr,
"Error, malformed input, expected only lowercase letters in the string.\n");
            exit(EXIT_FAILURE);
        }
        c -= 'a';
        P[c].Position[P[c].N++] = i;
    }

    /*  Count the specified substrings.  i and j are iterated through the
        indices of the allowed characters.  For each pair different i and j, we
        count the number of specified substrings that can be performed using
        the character of index i as "a" and the character of index j as "b" as
        described in Count2.  For each pair where i and j are identical, we
        count the number of specified substrings that can be formed using the
        character of index i alone.
    */
    T4 Sum = 0;
    for (size_t i = 0; i < L; ++i)
        for (size_t j = 0; j < L; ++j)
            Sum += i == j
                ? Count1(&P[i])
                : Count2(&P[i], &P[j]);

    printf("%" PRIT4 "\n", Sum);
}

Upvotes: 1

Anuj
Anuj

Reputation: 1665

The problem can be solved if you maintain an array which stores the cumulative frequency (the total of a frequency and all frequencies so far in a frequency distribution) of each character in the input string. Since the string will only consist of lower case characters, hence the array size will be [26][N+1].

For example:

index  - 1 2 3 4 5
string - a b a b a

cumulativeFrequency array:

    0  1  2  3  4  5
a   0  1  1  2  2  3
b   0  0  1  1  2  2

I have made the array by taking the index of first character of the input string as 1. Doing so will help us in solving the problem later. For now, just ignore column 0 and assume that the string starts from index 1 and not 0.


Useful facts

Using cumulative frequency array we can easily check if a character is present at any index i:

if cumulativeFrequency[i]-cumulativeFrequency[i-1] > 0

number of times a character is present from range i to j (excluding both i and j):

frequency between i and j =  cumulativeFrequency[j-1] - cumulativeFrequency[i]

Algorithm

1: for each character from a-z:
2:     Locate index a and c such that charAt[a] == charAt[c]
3:     for each pair (a, c):
4:         for character from a-z:
5:             b = frequency of character between a and c
6:             d = frequency of character after c
7:             count += b*d 

Time complexity

Line 1-2:

The outer most loop will run for 26 times. We need to locate all the pair(a, c), to do that we require a time complexity of O(n^2).

Line 3-4:

For each pair, we again run a loop 26 times to check how many times each character is present between a and c and after c.

Line 5-7:

Using cumulative frequency array, for each character we can easily calculate how many times it appears between a and c and after c in O(1).

Hence, overall complexity is O(26*n^2*26) = O(n^2).


Code

I code in Java. I do not have a code in C. I have used simple loops an array so it should be easy to understand.

//Input N and string 
//Do not pay attention to the next two lines since they are basically taking 
//input using Java input streams
int N = Integer.parseInt(bufferedReader.readLine().trim());
String str = bufferedReader.readLine().trim();

//Construct an array to store cumulative frequency of each character in the string
int[][] cumulativeFrequency = new int[26][N+1];

//Fill the cumulative frequency array
for (int i = 0;i < str.length();i++)
{
    //character an index i
    char ch = str.charAt(i);

    //Fill the cumulative frequency array for each character 
    for (int j = 0;j < 26;j++)
    {
        cumulativeFrequency[j][i+1] += cumulativeFrequency[j][i];
        if (ch-97 == j) cumulativeFrequency[j][i+1]++;
    }
}

int a, b, c, d;
long count = 0;

//Follow the steps of the algorithm here
for (int i = 0;i < 26;i++)
{
    for (int j = 1; j <= N - 2; j++)
    {
        //Check if character at i is present at index j
        a = cumulativeFrequency[i][j] - cumulativeFrequency[i][j - 1];

        if (a > 0)
        {
            //Check if character at i is present at index k
            for (int k = j + 2; k <= N; k++)
            {
                c = cumulativeFrequency[i][k] - cumulativeFrequency[i][k - 1];

                if (c > 0)
                {
                    //For each character, find b*d
                    for (int l = 0; l < 26; l++)
                    {
                        //For each character calculate b and d
                        b = cumulativeFrequency[l][k-1] - cumulativeFrequency[l][j];
                        d = cumulativeFrequency[l][N] - cumulativeFrequency[l][k];

                        count += b * d;
                        }
                    }
                }
            }
        }
    }

    System.out.println(count);

I hope I have helped you. The code I provided will not give time complexity error and it will work for all test cases. Do comment if you do not understand anything in my explanation.

Upvotes: 3

John Bollinger
John Bollinger

Reputation: 181932

Problem

It is perhaps useful for thinking about the problem to recognize that it is an exercise in counting overlapping intervals. For example, if we view each pair of the same characters in the input as marking the endpoints of a half-open interval, then the question is asking to count the number of pairs of intervals that overlap without one being a subset of the other.

Algorithm

One way to approach the problem would begin by identifying and recording all the intervals. It is straightforward to do this in a way that allows the intervals to be grouped by left endpoint and ordered by right endpoint within each group -- this falls out easily from a naive scan of the input with a two-level loop nest.

Such an organization of the intervals is convenient both for reducing the search space for overlaps and for more efficiently counting them. In particular, one can approach the counting like this:

  1. For each interval I, consider the interval groups for left endpoints strictly between the endpoints of I.
  2. Within each of the groups considered, perform a binary search for an interval having right endpoint one greater than the right endpoint of I, or the position where such an interval would occur.
  3. All members of that group from that point to the end satisfy the overlap criterion, so add that number to the total count.

Complexity Analysis

The sorted interval list and group sizes / boundaries can be created at O(n2) cost via a two-level loop nest. There may be as many as n * (n - 1) intervals altogether, occurring when all input characters are the same, so the list requires O(n2) storage.

The intervals are grouped into exactly n - 1 groups, some of which may be empty. For each interval (O(n2)), we consider up to n - 2 of those, and perform a binary search (O(log n)) on each one. This yields O(n3 log n) overall operations.

That's an algorithmic improvement over the O(n4) cost of your original algorithm, though it remains to be seen whether the improved asymptotic complexity manifests improved performance for the specific problem sizes being tested.

Upvotes: 0

Sorevan
Sorevan

Reputation: 156

Since the string S is made of only lowercase letters, you can maintain a 26x26 table (actually 25x25, ignore when i=j) that holds the appearance of all possible distinct two letter cases (e.g. ab, ac, bc, etc).

The following code tracks the completeness of each answer candidate(abab, acac, bcbc, etc) by two functions: checking for the AC position and checking for the BD position. Once the value reaches 4, it means that the candidate is a valid answer.

#include <stdio.h>

int digitsAC(int a)
{
    if(a % 2 == 0)
        return a + 1;
    return a;
}

int digitsBD(int b)
{
    if(b % 2 == 1)
        return b + 1;
    return b;
}

int main()
{
    int n, count = 0;
    char s[2002];
    int appearance2x2[26][26] = {0};
    scanf("%d%s", &n, s);
    for(int i = 0; i < n; ++i)
    {
        int id = s[i] - 'a';
        for(int j = 0; j < 26; ++j)
        {
            appearance2x2[id][j] = digitsAC(appearance2x2[id][j]);
            appearance2x2[j][id] = digitsBD(appearance2x2[j][id]);  
        }
    }
    //counting the results
    for(int i = 0; i < 26; ++i)
    {
        for(int j = 0; j < 26; ++j)
        {
            if(i == j)continue;
            if(appearance2x2[i][j] >= 4)count += ((appearance2x2[i][j] - 2) / 2);
        }
    }
    printf("%d", count);
    return 0;
}

The time complexity is O(26N), which is equal to linear. The code can be further accelerated by making bitwise mask operations, but I left the functions simple for clearness. Haven't tested it a lot, please tell me if you find any bugs in it!

edit: There exists problem when handling continuous appearing letters like aabbaabb

Upvotes: 1

user3777427
user3777427

Reputation: 41

Performing the equality check in early stages can save you some time. Also the check a>=0 && b>a && c>b && d>c && d<n seems to be unnecessary as you are already checking for this condition in the loops. An improved version can be as follows:

#include<stdio.h>

int main()
{
    int n, count = 0;
    char s[2002];
    scanf("%d%s", &n, s);
    for(int a = 0; a<n-3; a++)
    {
        for(int b = a + 1; b<n-2; b++)
        {
            for(int c = b + 1; c<n-1; c++)
            {
                if(s[a] == s[c]) {
                    for(int d = c + 1; d<n; d++)
                    {
                        if(s[b] == s[d])
                        {
                            count++;
                        }
                    }
                }
            }
        }
    }
    printf("%d", count);
    return 0;
}

Upvotes: 1

Related Questions