generate a sequence with respect to subsequences in python

Question

I try to generate the following sequences.

text   = ACCCEBCE
target = 000000D0

a random text of different characters is generated. In the text sequence, if the following subsequences are found, the target is going to be D or E. Otherwise, the target will be 0.

ABC     -->  D
BCD     -->  E

I write the following code. It works well if I generate a small number of characters. But it does not give any output if I make timesteps = 1000 etc.

import string
import random as rn
import numpy as np
def is_subseq(x, y):
    it = iter(y)
    return all(any(c == ch for c in it) for ch in x)


def count(a, b, m, n):  
  
    # If both first and second string  
    # is empty, or if second string  
    # is empty, return 1  
    if ((m == 0 and n == 0) or n == 0):  
        return 1
  
    # If only first string is empty  
    # and second string is not empty, 
    # return 0  
    if (m == 0): 
        return 0
  
    # If last characters are same  
    # Recur for remaining strings by  
    # 1. considering last characters  
    #    of both strings  
    # 2. ignoring last character  
    #    of first string  
    if (a[m - 1] == b[n - 1]):  
        return (count(a, b, m - 1, n - 1) + 
                count(a, b, m - 1, n))  
    else: 
          
        # If last characters are different,  
        # ignore last char of first string  
        # and recur for remaining string  
        return count(a, b, m - 1, n)  

# create a sequence classification instance
def get_sequence(n_timesteps):

    alphabet="ABCDE"#string.ascii_uppercase 
    text = ''.join(rn.choices(alphabet, k=n_timesteps))
    print(text)

    seq_length=3
    subseqX = []
    subseqY = []
    for i in range(0, len(alphabet) - seq_length, 1):
        seq_in = alphabet[i:i + seq_length]
        seq_out = alphabet[i + seq_length]
        subseqX.append([char for char in seq_in])
        subseqY.append(seq_out)
        print(seq_in, "	-->	",seq_out)
    
    y2 = []
    match = 0 
    countlist=np.zeros(len(subseqX))
    for i, val in enumerate(text):
        found = False
        counter = 0
        for g, val2 in enumerate(subseqX):
            listToStr = ''.join(map(str, subseqX[g]))
            howmany = count(text[:i], listToStr, len(text[:i]),len(listToStr))
            if is_subseq(listToStr, text[:i]):
                if countlist[g] < howmany:
                    match = match + howmany
                    countlist[g] = howmany
                    temp = g
                    found = True
        if found:
            y2.append(subseqY[temp])
        else:
            y2.append(0)
    print("counter:	", counter)
    print(text)
    print(y2)
     
# define problem properties
n_timesteps = 100
get_sequence(n_timesteps)

It might be because of the depth of the recursive function. But I need to generate 1000 or 10000 characters. How can I fix this problem? Any ideas?

Alain T. · Accepted Answer

I'm not sure I understand all you're trying to do (lots of code there), but I believe this simplified form of the function should work. It maintains a set of subsequences seen so far. It only extends them by adding the next letter when it is encountered. This allows the flagging to know if the prefix to the sequence up to the current character has been seen before.

def flagSequence(S,letters="ABCDE",seqLen=3):
    subSeqs    = set()
    result     = "0"
    for c in S[:-1]:
        p = letters.index(c)
        subSeqs.add(c)
        if p>0:
            subSeqs.update([s+c for s in subSeqs if s[-1]==letters[p-1]])
        if p in range(seqLen-1,len(letters)-1) and letters[p-seqLen+1:p+1] in subSeqs:
            result += letters[p+1]
        else:
            result += "0"
    return result

output:

text = "BDBACCBECEECAEAEDCAACBCCDDDBBDEEDABDBDE"

print(text)
print(flagSequence(text))

BDBACCBECEECAEAEDCAACBCCDDDBBDEEDABDBDE
000000000D00D0000ED00D0DDEEE00E00E00E0E

with more letters:

alphabet=string.ascii_uppercase 
text  = ''.join(rn.choices(alphabet, k=10000))
flags = flagSequence(text,alphabet)
print(text[:60])
print(flags[:60])

CHUJKAMWCAAIBXGIZFHALAWWFDDELXREMOQQVXFPNYJRQESRVEJKIAQILYSJ...
000000000000000000000M000000FM00FN00000G0OZK0RFTS0FKLJ0RJMZT...

with longer sequences:

alphabet=string.ascii_uppercase 
text  = ''.join(rn.choices(alphabet, k=10000))
flags = flagSequence(text,alphabet,seqLen=10)
print(text[200:260])
print(flags[200:260])

...PMZCDQXAOHVMTRLYCNCJABGGNZYAWIHJJCQKMMAENQFHNQTOQOPPGHVQZXZU...
...00N0000Y000WN000Z0O0K0000O0Z0X00KK00LNN00O000O00P0PQQ00WR0Y0...

generate a sequence with respect to subsequences in python

Answers (1)

Related Questions