Hugo Oliveira
Hugo Oliveira

Reputation: 11

Problem with Object Types operations in Julia for solving Rosalind problem "Open Reading Frames"

Code context: I am trying to solve the Rosalind problem "Open Reading Frames" (https://rosalind.info/problems/orf/). The approach I am using involves storing each ORF result in a variable called "orf" (a vector of Chars) and finally assigning them into the final result vector called Proteins (a vector of strings).

Error output:

ERROR: LoadError: MethodError: Cannot `convert` an object of type String to an object of type Char

Closest candidates are:
  convert(::Type{T}, ::Number) where T<:AbstractChar
   @ Base char.jl:184
  convert(::Type{T}, ::T) where T<:AbstractChar
   @ Base char.jl:187
  convert(::Type{T}, ::AbstractChar) where T<:AbstractChar
   @ Base char.jl:186
  ...

Stacktrace:
 [1] push!(a::Vector{Char}, item::String)
   @ Base ./array.jl:1060
 [2] find_orfs(sequence::String)
   @ Main ~/Documents/Codigos/RSLD_Open_Reading_Frames.jl:66
 [3] top-level scope
   @ ~/Documents/Codigos/RSLD_Open_Reading_Frames.jl:111

Code:


#=
The following code is proposed to complete the Rosalind activity "Open Reading Frames".
Given any DNA sequence, the algorithm should be capable to identify protein sequences that starts in an start codon and finishes at stop codon.
It should be achieved by navigating through the original and complementary sequences and accessing the 3 reading frames in each.
If the start codon is identified, the code should insert a 'M' into the sequence and consequently add the amino acids that corresponds to each codon.
Finally, the Protein vector of Strings should contain all possibilities of proteins.
=#

#Function to get the complementary strand given the DNA sequence
function reverse_complement(x::String)
    comp = Vector{Char}()
    for i in x
        if i == 'A'
            push!(comp, 'T')
        elseif i == 'T'
            push!(comp, 'A')
        elseif i == 'C'
            push!(comp, 'G')
        elseif i == 'G'
            push!(comp, 'C')
        end
    end
    return join(reverse(comp))
end

function find_orfs(sequence::String)

    #Creation of Codon dictionary
    codon_table = Dict(
        "TTT" => "F", "TTC" => "F",
        "TTA" => "L", "TTG" => "L", "CTT" => "L", "CTC" => "L", "CTA" => "L", "CTG" => "L",
        "ATT" => "I", "ATC" => "I", "ATA" => "I",
        "ATG" => "M",
        "GTT" => "V", "GTC" => "V", "GTA" => "V", "GTG" => "V",
        "TCT" => "S", "TCC" => "S", "TCA" => "S", "TCG" => "S", "AGT" => "S", "AGC" => "S",
        "CCT" => "P", "CCC" => "P", "CCA" => "P", "CCG" => "P",
        "ACT" => "T", "ACC" => "T", "ACA" => "T", "ACG" => "T",
        "GCT" => "A", "GCC" => "A", "GCA" => "A", "GCG" => "A",
        "TAT" => "Y", "TAC" => "Y",
        "TAA" => "STOP", "TAG" => "STOP", "TGA" => "STOP",
        "CAT" => "H", "CAC" => "H",
        "CAA" => "Q", "CAG" => "Q",
        "AAT" => "N", "AAC" => "N",
        "AAA" => "K", "AAG" => "K",
        "GAT" => "D", "GAC" => "D",
        "GAA" => "E", "GAG" => "E",
        "TGT" => "C", "TGC" => "C",
        "TGG" => "W",
        "CGT" => "R", "CGC" => "R", "CGA" => "R", "CGG" => "R", "AGA" => "R", "AGG" => "R",
        "GGT" => "G", "GGC" => "G", "GGA" => "G", "GGG" => "G"
    )  
   
    Proteins = Vector{String}()


    # Consider all three forward reading frames
        frames = [1, 2, 3]
    for j in frames
            for i in j:3:length(sequence) - 2
                orf = Vector{Char}()    #Vetor
                codon = sequence[i:i+2]

                if haskey(codon_table, codon)
                    amino_acid = codon_table[codon]
                    if amino_acid == "M" 
                        push!(orf, amino_acid)
                        continue  
                    elseif amino_acid == "STOP"
                        push!(Proteins, orf)
                        break  
                    else
                        push!(orf, amino_acid)
                    end
                else
                    error("Invalid codon: $codon")
                end
        end
    end


    comp_seq = reverse_complement(sequence)

    # Consider all three reverse reading frames
    for j in frames
            for i in j:3:length(comp_seq) - 2
                orf = Vector{Char}()
                codon = comp_seq[i:i+2]

                if haskey(codon_table, codon)
                    amino_acid = codon_table[codon]
                    if amino_acid == "M"
                        push!(orf, amino_acid)
                        continue  
                    elseif amino_acid == "STOP"
                        push!(Proteins, orf)
                        break  
                    else
                        push!(orf, amino_acid)
                    end
                else
                    error("Invalid codon: $codon")
                end
        end
    end

    return Proteins
end


sequence = "ATGGCCATGGCGCCCAGAACTGAGATCAATAGTACCCGTATAACGGGTGA"
result = find_orfs(sequence)
println(result)

I have already tried to make it change "orf" type to String, or AbstractString, but also doesn't work. Even tried to adapt the code to store each amino acid key in vector "orf" and then use its content as a string to put as argument in push! function, but have no positive result.

Upvotes: 0

Views: 53

Answers (1)

Sundar R
Sundar R

Reputation: 14735

As the Stacktrace indicates:

Stacktrace:
 [1] push!(a::Vector{Char}, item::String)
   @ Base ./array.jl:1060
 [2] find_orfs(sequence::String)
   @ Main ~/Documents/Codigos/RSLD_Open_Reading_Frames.jl:66

the problem is that you're trying to push! a string (item::String) into a vector of characters. In Julia, double quotes create strings, whereas single quotes create Char values. So the amino acids on the value side of the codon_table are all strings, not characters.

Since your goal is to get the Proteins as an array of strings, the easiest option here is to change orf to be a String, and concatenate the amino acids to it (as Dan suggests). So, initialize orf as orf = "", an empty string, and then instead of the push!(orf, amino_acid) lines, use orf *= amino_acid, which is shorthand for orf = orf * amino_acid (the * operator joins the existing orf with the new amino_acid string).

One more thing to note is that as it exists, this code resets orf to be empty after every amino acid, so the protein strings always end up empty. Setting the initial value for orf should be done outside the for i in j:3:length(comp_seq) - 2 loop, not inside it, to avoid this problem.

(I believe this code also has a logical error in not correctly dealing with the beginnings of codons, but I'll leave that to you to fix since that's part of the challenge.)

Upvotes: 1

Related Questions