From 8075a5bf53eefe4051171b6cd504fd38bb4868d5 Mon Sep 17 00:00:00 2001 From: Warren Alphonso Date: Wed, 5 Oct 2022 22:37:07 +0200 Subject: [PATCH 1/4] parse for DNA, RNA --- src/nucleicacid.jl | 130 +++++++++++++++++++---------------- test/runtests.jl | 164 ++++++++++++++++++++++++++++----------------- 2 files changed, 175 insertions(+), 119 deletions(-) diff --git a/src/nucleicacid.jl b/src/nucleicacid.jl index b3aa74e..a330915 100644 --- a/src/nucleicacid.jl +++ b/src/nucleicacid.jl @@ -57,41 +57,59 @@ RNA(nt::DNA) = convert(RNA, nt) # Conversion from/to characters # ----------------------------- - -function Base.convert(::Type{DNA}, c::Char) - if c > '\uff' - throw(InexactError(:convert, DNA, c)) +function Base.convert(t::Union{Type{DNA},Type{RNA}}, c::Char) + nt = tryparse(t, c) + if nt === nothing + throw(InexactError(:convert, t, c)) end - @inbounds dna = char_to_dna[convert(Int, c) + 1] + return nt +end +DNA(c::Char) = convert(DNA, c) +RNA(c::Char) = convert(RNA, c) + +function Base.convert(::Type{Char}, nt::DNA) + return dna_to_char[encoded_data(nt)+1] +end +Char(nt::DNA) = convert(Char, nt) + +function Base.convert(::Type{Char}, nt::RNA) + return rna_to_char[encoded_data(nt)+1] +end +Char(nt::RNA) = convert(Char, nt) + +function Base.tryparse(::Type{DNA}, c::Char) + c > '\uff' && return nothing + @inbounds dna = char_to_dna[convert(Int, c)+1] if !isvalid(DNA, dna) - throw(InexactError(:convert, DNA, c)) + return nothing end return encode(DNA, dna) end -DNA(c::Char) = convert(DNA, c) -function Base.convert(::Type{RNA}, c::Char) - if c > '\uff' - throw(InexactError(:convert, RNA, c)) - end - @inbounds rna = char_to_rna[convert(Int, c) + 1] +function Base.tryparse(::Type{RNA}, c::Char) + c > '\uff' && return nothing + @inbounds rna = char_to_rna[convert(Int, c)+1] if !isvalid(RNA, rna) - throw(InexactError(:convert, RNA, c)) + return nothing end return encode(RNA, rna) -end -RNA(c::Char) = convert(RNA, c) -function Base.convert(::Type{Char}, nt::DNA) - return dna_to_char[encoded_data(nt) + 1] end -Char(nt::DNA) = convert(Char, nt) -function Base.convert(::Type{Char}, nt::RNA) - return rna_to_char[encoded_data(nt) + 1] +function Base.tryparse(t::Union{Type{DNA},Type{RNA}}, s::AbstractString) + sizeof(s) == 1 && return tryparse(t, first(s)) + stripped = strip(s) + sizeof(stripped) == 1 && return tryparse(t, first(stripped)) + return nothing end -Char(nt::RNA) = convert(Char, nt) +function Base.parse(t::Union{Type{DNA},Type{RNA}}, c::Union{AbstractString,Char}) + nt = tryparse(t, c) + if nt === nothing + throw(ArgumentError("invalid nucleotide")) + end + return nt +end # Encoding of DNA and RNA NucleicAcids # ------------------------------------ @@ -145,21 +163,21 @@ dna_to_char const dna_to_char = let chararray = Vector{Char}(undef, 16) for (char, doc, bits) in [ - ('-', "DNA Gap", 0b0000), - ('A', "DNA Adenine", 0b0001), - ('C', "DNA Cytosine", 0b0010), - ('G', "DNA Guanine", 0b0100), - ('T', "DNA Thymine", 0b1000), - ('M', "DNA Adenine or Cytosine", 0b0011), - ('R', "DNA Adenine or Guanine", 0b0101), - ('W', "DNA Adenine or Thymine", 0b1001), - ('S', "DNA Cytosine or Guanine", 0b0110), - ('Y', "DNA Cytosine or Thymine", 0b1010), - ('K', "DNA Guanine or Thymine", 0b1100), - ('V', "DNA Adenine, Cytosine or Guanine", 0b0111), - ('H', "DNA Adenine, Cytosine or Thymine", 0b1011), - ('D', "DNA Adenine, Guanine or Thymine", 0b1101), - ('B', "DNA Cytosine, Guanine or Thymine", 0b1110), + ('-', "DNA Gap", 0b0000), + ('A', "DNA Adenine", 0b0001), + ('C', "DNA Cytosine", 0b0010), + ('G', "DNA Guanine", 0b0100), + ('T', "DNA Thymine", 0b1000), + ('M', "DNA Adenine or Cytosine", 0b0011), + ('R', "DNA Adenine or Guanine", 0b0101), + ('W', "DNA Adenine or Thymine", 0b1001), + ('S', "DNA Cytosine or Guanine", 0b0110), + ('Y', "DNA Cytosine or Thymine", 0b1010), + ('K', "DNA Guanine or Thymine", 0b1100), + ('V', "DNA Adenine, Cytosine or Guanine", 0b0111), + ('H', "DNA Adenine, Cytosine or Thymine", 0b1011), + ('D', "DNA Adenine, Guanine or Thymine", 0b1101), + ('B', "DNA Cytosine, Guanine or Thymine", 0b1110), ('N', "DNA Adenine, Cytosine, Guanine or Thymine", 0b1111)] var = Symbol("DNA_", char != '-' ? char : "Gap") @eval begin @@ -273,22 +291,22 @@ rna_to_char const rna_to_char = let chararray = Vector{Char}(undef, 16) for (char, doc, dna) in [ - ('-', "RNA Gap", DNA_Gap), - ('A', "RNA Adenine", DNA_A ), - ('C', "RNA Cytosine", DNA_C ), - ('G', "RNA Guanine", DNA_G ), - ('U', "RNA Uracil", DNA_T ), - ('M', "RNA Adenine or Cytosine", DNA_M ), - ('R', "RNA Adenine or Guanine", DNA_R ), - ('W', "RNA Adenine or Uracil", DNA_W ), - ('S', "RNA Cytosine or Guanine", DNA_S ), - ('Y', "RNA Cytosine or Uracil", DNA_Y ), - ('K', "RNA Guanine or Uracil", DNA_K ), - ('V', "RNA Adenine, Cytosine or Guanine", DNA_V ), - ('H', "RNA Adenine, Cytosine or Uracil", DNA_H ), - ('D', "RNA Adenine, Guanine or Uracil", DNA_D ), - ('B', "RNA Cytosine, Guanine or Uracil", DNA_B ), - ('N', "RNA Adenine, Cytosine, Guanine or Uracil", DNA_N )] + ('-', "RNA Gap", DNA_Gap), + ('A', "RNA Adenine", DNA_A), + ('C', "RNA Cytosine", DNA_C), + ('G', "RNA Guanine", DNA_G), + ('U', "RNA Uracil", DNA_T), + ('M', "RNA Adenine or Cytosine", DNA_M), + ('R', "RNA Adenine or Guanine", DNA_R), + ('W', "RNA Adenine or Uracil", DNA_W), + ('S', "RNA Cytosine or Guanine", DNA_S), + ('Y', "RNA Cytosine or Uracil", DNA_Y), + ('K', "RNA Guanine or Uracil", DNA_K), + ('V', "RNA Adenine, Cytosine or Guanine", DNA_V), + ('H', "RNA Adenine, Cytosine or Uracil", DNA_H), + ('D', "RNA Adenine, Guanine or Uracil", DNA_D), + ('B', "RNA Cytosine, Guanine or Uracil", DNA_B), + ('N', "RNA Adenine, Cytosine, Guanine or Uracil", DNA_N)] var = Symbol("RNA_", char != '-' ? char : "Gap") @eval begin @doc $(doc) const $(var) = reinterpret(RNA, $(dna)) @@ -446,7 +464,7 @@ function complement(nt::NucleicAcid) (bits & 0x02) << 1 | (bits & 0x04) >> 1) end -function Base.isvalid(::Type{T}, x::Integer) where T <: NucleicAcid +function Base.isvalid(::Type{T}, x::Integer) where {T<:NucleicAcid} return 0 ≤ x < 16 end @@ -454,15 +472,15 @@ function Base.isvalid(nt::NucleicAcid) return encoded_data(nt) ≤ 0b1111 end -function Base.:~(x::N) where N <: NucleicAcid +function Base.:~(x::N) where {N<:NucleicAcid} return encode(N, encoded_data(x) ⊻ 0b1111) end -function Base.:|(x::N, y::N) where N <: NucleicAcid +function Base.:|(x::N, y::N) where {N<:NucleicAcid} return encode(N, encoded_data(x) | encoded_data(y)) end -function Base.:&(x::N, y::N) where N <: NucleicAcid +function Base.:&(x::N, y::N) where {N<:NucleicAcid} return encode(N, encoded_data(x) & encoded_data(y)) end diff --git a/test/runtests.jl b/test/runtests.jl index 79d0f76..badd7d2 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -54,49 +54,49 @@ end @testset "DNA conversions to UInt8" begin @test encoded_data(DNA_Gap) === 0b0000 - @test encoded_data(DNA_A) === 0b0001 - @test encoded_data(DNA_C) === 0b0010 - @test encoded_data(DNA_M) === 0b0011 - @test encoded_data(DNA_G) === 0b0100 - @test encoded_data(DNA_R) === 0b0101 - @test encoded_data(DNA_S) === 0b0110 - @test encoded_data(DNA_V) === 0b0111 - @test encoded_data(DNA_T) === 0b1000 - @test encoded_data(DNA_W) === 0b1001 - @test encoded_data(DNA_Y) === 0b1010 - @test encoded_data(DNA_H) === 0b1011 - @test encoded_data(DNA_K) === 0b1100 - @test encoded_data(DNA_D) === 0b1101 - @test encoded_data(DNA_B) === 0b1110 - @test encoded_data(DNA_N) === 0b1111 + @test encoded_data(DNA_A) === 0b0001 + @test encoded_data(DNA_C) === 0b0010 + @test encoded_data(DNA_M) === 0b0011 + @test encoded_data(DNA_G) === 0b0100 + @test encoded_data(DNA_R) === 0b0101 + @test encoded_data(DNA_S) === 0b0110 + @test encoded_data(DNA_V) === 0b0111 + @test encoded_data(DNA_T) === 0b1000 + @test encoded_data(DNA_W) === 0b1001 + @test encoded_data(DNA_Y) === 0b1010 + @test encoded_data(DNA_H) === 0b1011 + @test encoded_data(DNA_K) === 0b1100 + @test encoded_data(DNA_D) === 0b1101 + @test encoded_data(DNA_B) === 0b1110 + @test encoded_data(DNA_N) === 0b1111 end @testset "RNA conversions to UInt8" begin @test encoded_data(RNA_Gap) === 0b0000 - @test encoded_data(RNA_A) === 0b0001 - @test encoded_data(RNA_C) === 0b0010 - @test encoded_data(RNA_M) === 0b0011 - @test encoded_data(RNA_G) === 0b0100 - @test encoded_data(RNA_R) === 0b0101 - @test encoded_data(RNA_S) === 0b0110 - @test encoded_data(RNA_V) === 0b0111 - @test encoded_data(RNA_U) === 0b1000 - @test encoded_data(RNA_W) === 0b1001 - @test encoded_data(RNA_Y) === 0b1010 - @test encoded_data(RNA_H) === 0b1011 - @test encoded_data(RNA_K) === 0b1100 - @test encoded_data(RNA_D) === 0b1101 - @test encoded_data(RNA_B) === 0b1110 - @test encoded_data(RNA_N) === 0b1111 + @test encoded_data(RNA_A) === 0b0001 + @test encoded_data(RNA_C) === 0b0010 + @test encoded_data(RNA_M) === 0b0011 + @test encoded_data(RNA_G) === 0b0100 + @test encoded_data(RNA_R) === 0b0101 + @test encoded_data(RNA_S) === 0b0110 + @test encoded_data(RNA_V) === 0b0111 + @test encoded_data(RNA_U) === 0b1000 + @test encoded_data(RNA_W) === 0b1001 + @test encoded_data(RNA_Y) === 0b1010 + @test encoded_data(RNA_H) === 0b1011 + @test encoded_data(RNA_K) === 0b1100 + @test encoded_data(RNA_D) === 0b1101 + @test encoded_data(RNA_B) === 0b1110 + @test encoded_data(RNA_N) === 0b1111 end - @testset "stringbyte" begin - for T in (DNA, RNA) - @test all(alphabet(DNA)) do i - UInt8(Char(i)) == stringbyte(i) - end - end - end + @testset "stringbyte" begin + for T in (DNA, RNA) + @test all(alphabet(DNA)) do i + UInt8(Char(i)) == stringbyte(i) + end + end + end end @testset "Char" begin @@ -139,10 +139,10 @@ end @testset "Nucleic acid types" begin fromto = [(DNA_Gap, RNA_Gap), (DNA_A, RNA_A), (DNA_C, RNA_C), - (DNA_M, RNA_M), (DNA_G, RNA_G), (DNA_R, RNA_R), - (DNA_S, RNA_S), (DNA_V, RNA_V), (DNA_T, RNA_U), - (DNA_W, RNA_W), (DNA_Y, RNA_Y), (DNA_H, RNA_H), - (DNA_K, RNA_K), (DNA_D, RNA_D), (DNA_B, RNA_B), (DNA_N, RNA_N)] + (DNA_M, RNA_M), (DNA_G, RNA_G), (DNA_R, RNA_R), + (DNA_S, RNA_S), (DNA_V, RNA_V), (DNA_T, RNA_U), + (DNA_W, RNA_W), (DNA_Y, RNA_Y), (DNA_H, RNA_H), + (DNA_K, RNA_K), (DNA_D, RNA_D), (DNA_B, RNA_B), (DNA_N, RNA_N)] for (from, to) in fromto @test convert(RNA, from) === RNA(from) === to @@ -152,8 +152,8 @@ end end @testset "iscompatible" begin - @test iscompatible(DNA_A, DNA_A) - @test iscompatible(DNA_A, DNA_R) + @test iscompatible(DNA_A, DNA_A) + @test iscompatible(DNA_A, DNA_R) @test !iscompatible(DNA_C, DNA_A) @test !iscompatible(DNA_C, DNA_R) @@ -162,8 +162,8 @@ end @test iscompatible(DNA_N, x) == (x != DNA_Gap) end - @test iscompatible(RNA_A, RNA_A) - @test iscompatible(RNA_A, RNA_R) + @test iscompatible(RNA_A, RNA_A) + @test iscompatible(RNA_A, RNA_R) @test !iscompatible(RNA_C, RNA_A) @test !iscompatible(RNA_C, RNA_R) @@ -255,7 +255,7 @@ end @testset "Logic operations and Order" begin @testset "DNA" begin @test ~DNA_Gap === DNA_N - @test ~DNA_N === DNA_Gap + @test ~DNA_N === DNA_Gap @test DNA_A | DNA_C === DNA_M @test DNA_A & DNA_C === DNA_Gap @test_throws Exception DNA_A & RNA_A @@ -290,11 +290,11 @@ end end @testset "Broadcasting" begin - v = DNA[DNA_A, DNA_C, DNA_G, DNA_C] - v[2:3] .= DNA_A - @test v == [DNA_A, DNA_A, DNA_A, DNA_C] - v .= DNA_T - @test v == fill(DNA_T, 4) + v = DNA[DNA_A, DNA_C, DNA_G, DNA_C] + v[2:3] .= DNA_A + @test v == [DNA_A, DNA_A, DNA_A, DNA_C] + v .= DNA_T + @test v == fill(DNA_T, 4) end @testset "Show DNA" begin @@ -384,6 +384,44 @@ end @test collect(ACGUN) == [RNA_A, RNA_C, RNA_G, RNA_U, RNA_N] end + @testset "Parsers" begin + @testset "Valid Cases" begin + fromto = [('a', DNA_A), ('c', RNA_C)] + + for (from, to) in fromto + @test parse(typeof(to), from) === tryparse(typeof(to), from) === to + # Strings also work + str_from = string(from) + @test parse(typeof(to), str_from) === tryparse(typeof(to), str_from) === to + # Case doesn't matter + @test parse(typeof(to), uppercase(from)) === parse(typeof(to), lowercase(from)) === to + # Whitespace doesn't matter + whitespace_from = "\t" * from * " \n" + @test parse(typeof(to), whitespace_from) === tryparse(typeof(to), whitespace_from) === to + end + end + + @testset "Invalid Cases" begin + @test_throws ArgumentError parse(DNA, "") + @test_throws ArgumentError parse(RNA, "") + @test_throws ArgumentError parse(DNA, "U") + @test_throws ArgumentError parse(RNA, "T") + @test_throws ArgumentError parse(DNA, "AL") + @test_throws ArgumentError parse(RNA, "LA") + @test_throws ArgumentError parse(RNA, '\0') + @test_throws ArgumentError parse(DNA, '@') + @test_throws ArgumentError parse(DNA, '亜') + @test tryparse(DNA, "U") === tryparse(RNA, "T") == nothing + @test tryparse(DNA, "") === tryparse(RNA, "") == nothing + @test tryparse(DNA, "AL") === tryparse(RNA, "AL") === nothing + @test tryparse(DNA, "LA") === tryparse(RNA, "LA") === nothing + @test tryparse(DNA, "ALAA") === tryparse(RNA, "ALAA") === nothing + @test tryparse(DNA, '\0') === tryparse(RNA, '\0') === nothing + @test tryparse(DNA, '@') === tryparse(RNA, '@') === nothing + @test tryparse(DNA, '亜') === tryparse(RNA, '亜') === nothing + end + end + @testset "Hashing" begin @test hash(DNA_A) != hash(RNA_A) @test hash(DNA_A) != hash(DNA_G) @@ -409,12 +447,12 @@ end @test encode(AminoAcid, UInt8(10)) === AA_L for (c, aa) in [ - ('A', AA_A), ('R', AA_R), ('N', AA_N), ('D', AA_D), ('C', AA_C), - ('Q', AA_Q), ('E', AA_E), ('G', AA_G), ('H', AA_H), ('I', AA_I), - ('L', AA_L), ('K', AA_K), ('M', AA_M), ('F', AA_F), ('P', AA_P), - ('S', AA_S), ('T', AA_T), ('W', AA_W), ('Y', AA_Y), ('V', AA_V), - ('O', AA_O), ('U', AA_U), ('B', AA_B), ('J', AA_J), ('Z', AA_Z), - ('X', AA_X), ('*', AA_Term), ('-', AA_Gap)] + ('A', AA_A), ('R', AA_R), ('N', AA_N), ('D', AA_D), ('C', AA_C), + ('Q', AA_Q), ('E', AA_E), ('G', AA_G), ('H', AA_H), ('I', AA_I), + ('L', AA_L), ('K', AA_K), ('M', AA_M), ('F', AA_F), ('P', AA_P), + ('S', AA_S), ('T', AA_T), ('W', AA_W), ('Y', AA_Y), ('V', AA_V), + ('O', AA_O), ('U', AA_U), ('B', AA_B), ('J', AA_J), ('Z', AA_Z), + ('X', AA_X), ('*', AA_Term), ('-', AA_Gap)] @test convert(AminoAcid, c) === convert(AminoAcid, lowercase(c)) == AminoAcid(c) === aa @test Char(aa) === c end @@ -424,9 +462,9 @@ end end @testset "stringbyte" begin - @test all(alphabet(AminoAcid)) do i - UInt8(Char(i)) == stringbyte(i) - end + @test all(alphabet(AminoAcid)) do i + UInt8(Char(i)) == stringbyte(i) + end end @testset "isvalid" begin @@ -435,13 +473,13 @@ end end @test !isvalid(encode(AminoAcid, 0x1c)) @test !isvalid(encode(AminoAcid, 0xff)) - @test isvalid(AminoAcid, 0x1b) + @test isvalid(AminoAcid, 0x1b) @test !isvalid(AminoAcid, 0x1c) end @testset "Logic operations and Order" begin @test ~RNA_Gap === RNA_N - @test ~RNA_N === RNA_Gap + @test ~RNA_N === RNA_Gap @test RNA_A | RNA_C === RNA_M @test RNA_A & RNA_C === RNA_Gap @test (AA_A < AA_R < AA_N < AA_V < AA_O < AA_U < @@ -456,7 +494,7 @@ end end @testset "iscompatible" begin - @test iscompatible(AA_A, AA_A) + @test iscompatible(AA_A, AA_A) @test !iscompatible(AA_A, AA_R) for x in alphabet(AminoAcid) From b2daaea63dab35ab62dd3d8accb49d9062a3b94e Mon Sep 17 00:00:00 2001 From: Warren Alphonso Date: Wed, 5 Oct 2022 22:44:25 +0200 Subject: [PATCH 2/4] Formatting --- src/nucleicacid.jl | 70 ++++++++++++++++----------------- test/runtests.jl | 96 +++++++++++++++++++++++----------------------- 2 files changed, 83 insertions(+), 83 deletions(-) diff --git a/src/nucleicacid.jl b/src/nucleicacid.jl index a330915..b8fbaa3 100644 --- a/src/nucleicacid.jl +++ b/src/nucleicacid.jl @@ -163,21 +163,21 @@ dna_to_char const dna_to_char = let chararray = Vector{Char}(undef, 16) for (char, doc, bits) in [ - ('-', "DNA Gap", 0b0000), - ('A', "DNA Adenine", 0b0001), - ('C', "DNA Cytosine", 0b0010), - ('G', "DNA Guanine", 0b0100), - ('T', "DNA Thymine", 0b1000), - ('M', "DNA Adenine or Cytosine", 0b0011), - ('R', "DNA Adenine or Guanine", 0b0101), - ('W', "DNA Adenine or Thymine", 0b1001), - ('S', "DNA Cytosine or Guanine", 0b0110), - ('Y', "DNA Cytosine or Thymine", 0b1010), - ('K', "DNA Guanine or Thymine", 0b1100), - ('V', "DNA Adenine, Cytosine or Guanine", 0b0111), - ('H', "DNA Adenine, Cytosine or Thymine", 0b1011), - ('D', "DNA Adenine, Guanine or Thymine", 0b1101), - ('B', "DNA Cytosine, Guanine or Thymine", 0b1110), + ('-', "DNA Gap", 0b0000), + ('A', "DNA Adenine", 0b0001), + ('C', "DNA Cytosine", 0b0010), + ('G', "DNA Guanine", 0b0100), + ('T', "DNA Thymine", 0b1000), + ('M', "DNA Adenine or Cytosine", 0b0011), + ('R', "DNA Adenine or Guanine", 0b0101), + ('W', "DNA Adenine or Thymine", 0b1001), + ('S', "DNA Cytosine or Guanine", 0b0110), + ('Y', "DNA Cytosine or Thymine", 0b1010), + ('K', "DNA Guanine or Thymine", 0b1100), + ('V', "DNA Adenine, Cytosine or Guanine", 0b0111), + ('H', "DNA Adenine, Cytosine or Thymine", 0b1011), + ('D', "DNA Adenine, Guanine or Thymine", 0b1101), + ('B', "DNA Cytosine, Guanine or Thymine", 0b1110), ('N', "DNA Adenine, Cytosine, Guanine or Thymine", 0b1111)] var = Symbol("DNA_", char != '-' ? char : "Gap") @eval begin @@ -291,22 +291,22 @@ rna_to_char const rna_to_char = let chararray = Vector{Char}(undef, 16) for (char, doc, dna) in [ - ('-', "RNA Gap", DNA_Gap), - ('A', "RNA Adenine", DNA_A), - ('C', "RNA Cytosine", DNA_C), - ('G', "RNA Guanine", DNA_G), - ('U', "RNA Uracil", DNA_T), - ('M', "RNA Adenine or Cytosine", DNA_M), - ('R', "RNA Adenine or Guanine", DNA_R), - ('W', "RNA Adenine or Uracil", DNA_W), - ('S', "RNA Cytosine or Guanine", DNA_S), - ('Y', "RNA Cytosine or Uracil", DNA_Y), - ('K', "RNA Guanine or Uracil", DNA_K), - ('V', "RNA Adenine, Cytosine or Guanine", DNA_V), - ('H', "RNA Adenine, Cytosine or Uracil", DNA_H), - ('D', "RNA Adenine, Guanine or Uracil", DNA_D), - ('B', "RNA Cytosine, Guanine or Uracil", DNA_B), - ('N', "RNA Adenine, Cytosine, Guanine or Uracil", DNA_N)] + ('-', "RNA Gap", DNA_Gap), + ('A', "RNA Adenine", DNA_A ), + ('C', "RNA Cytosine", DNA_C ), + ('G', "RNA Guanine", DNA_G ), + ('U', "RNA Uracil", DNA_T ), + ('M', "RNA Adenine or Cytosine", DNA_M ), + ('R', "RNA Adenine or Guanine", DNA_R ), + ('W', "RNA Adenine or Uracil", DNA_W ), + ('S', "RNA Cytosine or Guanine", DNA_S ), + ('Y', "RNA Cytosine or Uracil", DNA_Y ), + ('K', "RNA Guanine or Uracil", DNA_K ), + ('V', "RNA Adenine, Cytosine or Guanine", DNA_V ), + ('H', "RNA Adenine, Cytosine or Uracil", DNA_H ), + ('D', "RNA Adenine, Guanine or Uracil", DNA_D ), + ('B', "RNA Cytosine, Guanine or Uracil", DNA_B ), + ('N', "RNA Adenine, Cytosine, Guanine or Uracil", DNA_N )] var = Symbol("RNA_", char != '-' ? char : "Gap") @eval begin @doc $(doc) const $(var) = reinterpret(RNA, $(dna)) @@ -464,7 +464,7 @@ function complement(nt::NucleicAcid) (bits & 0x02) << 1 | (bits & 0x04) >> 1) end -function Base.isvalid(::Type{T}, x::Integer) where {T<:NucleicAcid} +function Base.isvalid(::Type{T}, x::Integer) where T <: NucleicAcid return 0 ≤ x < 16 end @@ -472,15 +472,15 @@ function Base.isvalid(nt::NucleicAcid) return encoded_data(nt) ≤ 0b1111 end -function Base.:~(x::N) where {N<:NucleicAcid} +function Base.:~(x::N) where N <: NucleicAcid return encode(N, encoded_data(x) ⊻ 0b1111) end -function Base.:|(x::N, y::N) where {N<:NucleicAcid} +function Base.:|(x::N, y::N) where N <: NucleicAcid return encode(N, encoded_data(x) | encoded_data(y)) end -function Base.:&(x::N, y::N) where {N<:NucleicAcid} +function Base.:&(x::N, y::N) where N <: NucleicAcid return encode(N, encoded_data(x) & encoded_data(y)) end diff --git a/test/runtests.jl b/test/runtests.jl index badd7d2..a32ef6e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -54,40 +54,40 @@ end @testset "DNA conversions to UInt8" begin @test encoded_data(DNA_Gap) === 0b0000 - @test encoded_data(DNA_A) === 0b0001 - @test encoded_data(DNA_C) === 0b0010 - @test encoded_data(DNA_M) === 0b0011 - @test encoded_data(DNA_G) === 0b0100 - @test encoded_data(DNA_R) === 0b0101 - @test encoded_data(DNA_S) === 0b0110 - @test encoded_data(DNA_V) === 0b0111 - @test encoded_data(DNA_T) === 0b1000 - @test encoded_data(DNA_W) === 0b1001 - @test encoded_data(DNA_Y) === 0b1010 - @test encoded_data(DNA_H) === 0b1011 - @test encoded_data(DNA_K) === 0b1100 - @test encoded_data(DNA_D) === 0b1101 - @test encoded_data(DNA_B) === 0b1110 - @test encoded_data(DNA_N) === 0b1111 + @test encoded_data(DNA_A) === 0b0001 + @test encoded_data(DNA_C) === 0b0010 + @test encoded_data(DNA_M) === 0b0011 + @test encoded_data(DNA_G) === 0b0100 + @test encoded_data(DNA_R) === 0b0101 + @test encoded_data(DNA_S) === 0b0110 + @test encoded_data(DNA_V) === 0b0111 + @test encoded_data(DNA_T) === 0b1000 + @test encoded_data(DNA_W) === 0b1001 + @test encoded_data(DNA_Y) === 0b1010 + @test encoded_data(DNA_H) === 0b1011 + @test encoded_data(DNA_K) === 0b1100 + @test encoded_data(DNA_D) === 0b1101 + @test encoded_data(DNA_B) === 0b1110 + @test encoded_data(DNA_N) === 0b1111 end @testset "RNA conversions to UInt8" begin @test encoded_data(RNA_Gap) === 0b0000 - @test encoded_data(RNA_A) === 0b0001 - @test encoded_data(RNA_C) === 0b0010 - @test encoded_data(RNA_M) === 0b0011 - @test encoded_data(RNA_G) === 0b0100 - @test encoded_data(RNA_R) === 0b0101 - @test encoded_data(RNA_S) === 0b0110 - @test encoded_data(RNA_V) === 0b0111 - @test encoded_data(RNA_U) === 0b1000 - @test encoded_data(RNA_W) === 0b1001 - @test encoded_data(RNA_Y) === 0b1010 - @test encoded_data(RNA_H) === 0b1011 - @test encoded_data(RNA_K) === 0b1100 - @test encoded_data(RNA_D) === 0b1101 - @test encoded_data(RNA_B) === 0b1110 - @test encoded_data(RNA_N) === 0b1111 + @test encoded_data(RNA_A) === 0b0001 + @test encoded_data(RNA_C) === 0b0010 + @test encoded_data(RNA_M) === 0b0011 + @test encoded_data(RNA_G) === 0b0100 + @test encoded_data(RNA_R) === 0b0101 + @test encoded_data(RNA_S) === 0b0110 + @test encoded_data(RNA_V) === 0b0111 + @test encoded_data(RNA_U) === 0b1000 + @test encoded_data(RNA_W) === 0b1001 + @test encoded_data(RNA_Y) === 0b1010 + @test encoded_data(RNA_H) === 0b1011 + @test encoded_data(RNA_K) === 0b1100 + @test encoded_data(RNA_D) === 0b1101 + @test encoded_data(RNA_B) === 0b1110 + @test encoded_data(RNA_N) === 0b1111 end @testset "stringbyte" begin @@ -139,10 +139,10 @@ end @testset "Nucleic acid types" begin fromto = [(DNA_Gap, RNA_Gap), (DNA_A, RNA_A), (DNA_C, RNA_C), - (DNA_M, RNA_M), (DNA_G, RNA_G), (DNA_R, RNA_R), - (DNA_S, RNA_S), (DNA_V, RNA_V), (DNA_T, RNA_U), - (DNA_W, RNA_W), (DNA_Y, RNA_Y), (DNA_H, RNA_H), - (DNA_K, RNA_K), (DNA_D, RNA_D), (DNA_B, RNA_B), (DNA_N, RNA_N)] + (DNA_M, RNA_M), (DNA_G, RNA_G), (DNA_R, RNA_R), + (DNA_S, RNA_S), (DNA_V, RNA_V), (DNA_T, RNA_U), + (DNA_W, RNA_W), (DNA_Y, RNA_Y), (DNA_H, RNA_H), + (DNA_K, RNA_K), (DNA_D, RNA_D), (DNA_B, RNA_B), (DNA_N, RNA_N)] for (from, to) in fromto @test convert(RNA, from) === RNA(from) === to @@ -152,8 +152,8 @@ end end @testset "iscompatible" begin - @test iscompatible(DNA_A, DNA_A) - @test iscompatible(DNA_A, DNA_R) + @test iscompatible(DNA_A, DNA_A) + @test iscompatible(DNA_A, DNA_R) @test !iscompatible(DNA_C, DNA_A) @test !iscompatible(DNA_C, DNA_R) @@ -162,8 +162,8 @@ end @test iscompatible(DNA_N, x) == (x != DNA_Gap) end - @test iscompatible(RNA_A, RNA_A) - @test iscompatible(RNA_A, RNA_R) + @test iscompatible(RNA_A, RNA_A) + @test iscompatible(RNA_A, RNA_R) @test !iscompatible(RNA_C, RNA_A) @test !iscompatible(RNA_C, RNA_R) @@ -255,7 +255,7 @@ end @testset "Logic operations and Order" begin @testset "DNA" begin @test ~DNA_Gap === DNA_N - @test ~DNA_N === DNA_Gap + @test ~DNA_N === DNA_Gap @test DNA_A | DNA_C === DNA_M @test DNA_A & DNA_C === DNA_Gap @test_throws Exception DNA_A & RNA_A @@ -447,12 +447,12 @@ end @test encode(AminoAcid, UInt8(10)) === AA_L for (c, aa) in [ - ('A', AA_A), ('R', AA_R), ('N', AA_N), ('D', AA_D), ('C', AA_C), - ('Q', AA_Q), ('E', AA_E), ('G', AA_G), ('H', AA_H), ('I', AA_I), - ('L', AA_L), ('K', AA_K), ('M', AA_M), ('F', AA_F), ('P', AA_P), - ('S', AA_S), ('T', AA_T), ('W', AA_W), ('Y', AA_Y), ('V', AA_V), - ('O', AA_O), ('U', AA_U), ('B', AA_B), ('J', AA_J), ('Z', AA_Z), - ('X', AA_X), ('*', AA_Term), ('-', AA_Gap)] + ('A', AA_A), ('R', AA_R), ('N', AA_N), ('D', AA_D), ('C', AA_C), + ('Q', AA_Q), ('E', AA_E), ('G', AA_G), ('H', AA_H), ('I', AA_I), + ('L', AA_L), ('K', AA_K), ('M', AA_M), ('F', AA_F), ('P', AA_P), + ('S', AA_S), ('T', AA_T), ('W', AA_W), ('Y', AA_Y), ('V', AA_V), + ('O', AA_O), ('U', AA_U), ('B', AA_B), ('J', AA_J), ('Z', AA_Z), + ('X', AA_X), ('*', AA_Term), ('-', AA_Gap)] @test convert(AminoAcid, c) === convert(AminoAcid, lowercase(c)) == AminoAcid(c) === aa @test Char(aa) === c end @@ -473,13 +473,13 @@ end end @test !isvalid(encode(AminoAcid, 0x1c)) @test !isvalid(encode(AminoAcid, 0xff)) - @test isvalid(AminoAcid, 0x1b) + @test isvalid(AminoAcid, 0x1b) @test !isvalid(AminoAcid, 0x1c) end @testset "Logic operations and Order" begin @test ~RNA_Gap === RNA_N - @test ~RNA_N === RNA_Gap + @test ~RNA_N === RNA_Gap @test RNA_A | RNA_C === RNA_M @test RNA_A & RNA_C === RNA_Gap @test (AA_A < AA_R < AA_N < AA_V < AA_O < AA_U < @@ -494,7 +494,7 @@ end end @testset "iscompatible" begin - @test iscompatible(AA_A, AA_A) + @test iscompatible(AA_A, AA_A) @test !iscompatible(AA_A, AA_R) for x in alphabet(AminoAcid) From 8267f462c31a1e1c6ddb4387b4271ce04f37ecad Mon Sep 17 00:00:00 2001 From: Warren Alphonso Date: Wed, 5 Oct 2022 22:48:31 +0200 Subject: [PATCH 3/4] CHANGELOG --- CHANGELOG.md | 2 ++ test/runtests.jl | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c000d42..32e9290 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Added +- `Base.parse` and `Base.tryparse` for `DNA` and `RNA` ## [4.0.0] ### Added diff --git a/test/runtests.jl b/test/runtests.jl index a32ef6e..6e8a959 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -386,7 +386,7 @@ end @testset "Parsers" begin @testset "Valid Cases" begin - fromto = [('a', DNA_A), ('c', RNA_C)] + fromto = [('a', DNA_A), ('c', RNA_C), ('s', DNA_S), ('s', RNA_S)] for (from, to) in fromto @test parse(typeof(to), from) === tryparse(typeof(to), from) === to From 93a5fcc289cd102ab8b35ebac2f075a37e0bd627 Mon Sep 17 00:00:00 2001 From: Warren Alphonso Date: Wed, 5 Oct 2022 22:57:57 +0200 Subject: [PATCH 4/4] Update docs with example --- docs/src/nucleicacids.md | 19 +++++++++++++++++++ docs/src/sequences.md | 4 ++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/docs/src/nucleicacids.md b/docs/src/nucleicacids.md index 0d67509..ee2714b 100644 --- a/docs/src/nucleicacids.md +++ b/docs/src/nucleicacids.md @@ -89,6 +89,25 @@ true ``` +`parse` also works on characters and strings in a case-insensitive way: +```jldoctest +julia> parse(DNA, "T") +DNA_T + +julia> parse(RNA, "U") +RNA_U + +julia> parse(RNA, "U") == parse(RNA, "u") +true + +julia> tryparse(DNA, "A") # tryparse returns either a DNA nucleotide or nothing +DNA_A + +julia> tryparse(DNA, "SD") + +``` + + `print` and `show` methods are defined to output the text representation of a symbol: ```jldoctest julia> print(DNA_A) # un-decorated text diff --git a/docs/src/sequences.md b/docs/src/sequences.md index 710dee6..9977d25 100644 --- a/docs/src/sequences.md +++ b/docs/src/sequences.md @@ -15,14 +15,14 @@ A quick way to create a DNA/RNA sequence is storing symbols in a vector. ```jldoctest julia> seq = [DNA_A, DNA_C, DNA_G, DNA_T] -4-element Array{DNA,1}: +4-element Vector{DNA}: DNA_A DNA_C DNA_G DNA_T julia> [convert(DNA, x) for x in "ACGT"] # from a string -4-element Array{DNA,1}: +4-element Vector{DNA}: DNA_A DNA_C DNA_G