#Seq Manipulation

import Bio as Bio
from Bio.Seq import Seq

my_seq = Seq("AGTACACTGGT")
#my_seq

#print(my_seq)
my_seq.complement()
my_seq.reverse_complement()

# FASTA Parsing
from Bio import SeqIO
for seq_record in SeqIO.parse("ProteinFastaResults2K.fasta", "fasta"):
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record))

Seq like strings¶

3.2 Sequences act like strings¶

In many ways, we can deal with Seq objects as if they were normal Python strings, for example getting the length, or iterating over the elements:¶

from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
my_seq = Seq("GATCG", IUPAC.unambiguous_dna)
for index, letter in enumerate(my_seq):
    print("%i %s" % (index, letter))
   
#You can access elements of the sequence in the same way as for strings (but remember, Python counts from zero!):
print(my_seq[0]) #first letter
print(my_seq[2]) #third letter
print(my_seq[-1]) #last letter

Seq Count¶

The Seq object has a .count() method, just like a string. Note that this means that like a Python string, this gives a non-overlapping count:¶

from Bio.Seq import Seq
print("AAAA".count("AA"))
print(Seq("AAAA").count("AA"))

#DNA Perc
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC", IUPAC.unambiguous_dna)

print(len(my_seq))
print(my_seq.count("G"))
print(100 * float(my_seq.count("G") + my_seq.count("C")) / len(my_seq))

#Or 
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqUtils import GC
my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC", IUPAC.unambiguous_dna)
GC(my_seq)

#Seq as String
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC", IUPAC.unambiguous_dna)
print(my_seq[4:12])
print(my_seq[0::3])#First codon position
print(my_seq[1::3])#Second Codon Position
print(my_seq[2::3])#Third Codon Position
print(my_seq[::-1])#Reverse String

#Seq into Fasta
fasta_format_string = ">Name\n%s\n" % my_seq
print(fasta_format_string)

3.6 Changing case¶

Python strings have very useful upper and lower methods for changing the case. As of Biopython 1.53, the Seq object gained similar methods which are alphabet aware. For example,¶

from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
dna_seq = Seq("acgtACGT", generic_dna)
print(dna_seq)
print(dna_seq.upper())
print(dna_seq.lower())

#Transcription
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", IUPAC.unambiguous_dna)
print(coding_dna)

template_dna = coding_dna.reverse_complement()
print(template_dna)
print(coding_dna)

messenger_rna = coding_dna.transcribe()
print(messenger_rna)

#As you can see, all this does is switch T → U, and adjust the alphabet.
#If you do want to do a true biological transcription starting with the template strand, then this becomes a two-step process:
print(template_dna.reverse_complement().transcribe())

The Seq object also includes a back-transcription method for going from the mRNA to the coding strand of the DNA. Again, this is a simple U → T substitution and associated change of alphabet:¶

from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG", IUPAC.unambiguous_rna)
print(messenger_rna)
print(messenger_rna.back_transcribe())

Sticking with the same example discussed in the transcription section above, now let’s translate this mRNA into the corresponding protein sequence - again taking advantage of one of the Seq object’s biological methods:¶

from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG", IUPAC.unambiguous_rna)
print(messenger_rna)
print(messenger_rna.translate())

#You can also translate directly from the coding strand DNA sequence:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", IUPAC.unambiguous_dna)
print(coding_dna)
print(coding_dna.translate())

#Seq comparison
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
seq1 = Seq("ACGT", IUPAC.unambiguous_dna)
seq2 = Seq("ACGT", IUPAC.ambiguous_dna)
print(str(seq1) == str(seq2))
print(str(seq1) == str(seq1))

#In general Bio.SeqIO.parse() is used to read in sequence files as SeqRecord objects, and is typically used with a for loop like this:
from Bio import SeqIO
for seq_record in SeqIO.parse("ProteinFastaResults2K.fasta", "fasta"):
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record))

Another very common way to use a Python iterator is within a list comprehension (or a generator expression). For example, if all you wanted to extract from the file was a list of the record identifiers we can easily do this with the following list comprehension:¶

from Bio import SeqIO
identifiers = [seq_record.id for seq_record in SeqIO.parse("ProteinFastaResults2K.fasta", "fasta")]
identifiers

The object returned by Bio.SeqIO is actually an iterator which returns SeqRecord objects. You get to see each record in turn, but once and only once. The plus point is that an iterator can save you memory when dealing with large files.¶

Instead of using a for loop, can also use the next() function on an iterator to step through the entries, like this:¶

from Bio import SeqIO
record_iterator = SeqIO.parse("ProteinFastaResults2K.fasta", "fasta")

first_record = next(record_iterator)
print(first_record.id)
print(first_record.description)

second_record = next(record_iterator)
print(second_record.id)
print(second_record.description)

Very often you need to be able to access the records in any order. The Python list data type is perfect for this, and we can turn the record iterator into a list of SeqRecord objects using the built-in Python function list() like so:¶

from Bio import SeqIO
records = list(SeqIO.parse("ProteinFastaResults2K.fasta", "fasta"))

print("Found %i records" % len(records))

print("The last record")
last_record = records[-1] #using Python's list tricks
print(last_record.id)
print(repr(last_record.seq))
print(len(last_record))

print("The first record")
first_record = records[0] #remember, Python counts from zero
print(first_record.id)
print(repr(first_record.seq))
print(len(first_record))

#Stored Annotations
from Bio import SeqIO
record_iterator = SeqIO.parse("ProteinFastaResults2K.fasta", "fasta")
first_record = next(record_iterator)
print(first_record)

#calculates the total length of the sequences
from Bio import SeqIO
print(sum(len(r) for r in SeqIO.parse("ProteinFastaResults2K.fasta", "fasta")))

#Here we use a file handle instead, using the with statement to close the handle automatically:
from Bio import SeqIO
with open("ProteinFastaResults2K.fasta") as handle:
    print(sum(len(r) for r in SeqIO.parse(handle, "fasta")))

#from Bio import SeqIO
handle = open("ProteinFastaResults2K.fasta")
print(sum(len(r) for r in SeqIO.parse(handle, "fasta")))

handle.close()

#Writing Fasta
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_protein

rec1 = SeqRecord(Seq("MMYQQGCFAGGTVLRLAKDLAENNRGARVLVVCSEITAVTFRGPSETHLDSMVGQALFGD" \
                    +"GAGAVIVGSDPDLSVERPLYELVWTGATLLPDSEGAIDGHLREVGLTFHLLKDVPGLISK" \
                    +"NIEKSLKEAFTPLGISDWNSTFWIAHPGGPAILDQVEAKLGLKEEKMRATREVLSEYGNM" \
                    +"SSAC", generic_protein),
                 id="gi|14150838|gb|AAK54648.1|AF376133_1",
                 description="chalcone synthase [Cucumis sativus]")

rec2 = SeqRecord(Seq("YPDYYFRITNREHKAELKEKFQRMCDKSMIKKRYMYLTEEILKENPSMCEYMAPSLDARQ" \
                    +"DMVVVEIPKLGKEAAVKAIKEWGQ", generic_protein),
                 id="gi|13919613|gb|AAK33142.1|",
                 description="chalcone synthase [Fragaria vesca subsp. bracteata]")

rec3 = SeqRecord(Seq("MVTVEEFRRAQCAEGPATVMAIGTATPSNCVDQSTYPDYYFRITNSEHKVELKEKFKRMC" \
                    +"EKSMIKKRYMHLTEEILKENPNICAYMAPSLDARQDIVVVEVPKLGKEAAQKAIKEWGQP" \
                    +"KSKITHLVFCTTSGVDMPGCDYQLTKLLGLRPSVKRFMMYQQGCFAGGTVLRMAKDLAEN" \
                    +"NKGARVLVVCSEITAVTFRGPNDTHLDSLVGQALFGDGAAAVIIGSDPIPEVERPLFELV" \
                    +"SAAQTLLPDSEGAIDGHLREVGLTFHLLKDVPGLISKNIEKSLVEAFQPLGISDWNSLFW" \
                    +"IAHPGGPAILDQVELKLGLKQEKLKATRKVLSNYGNMSSACVLFILDEMRKASAKEGLGT" \
                    +"TGEGLEWGVLFGFGPGLTVETVVLHSVAT", generic_protein),
                 id="gi|13925890|gb|AAK49457.1|",
                 description="chalcone synthase [Nicotiana tabacum]")

my_records = [rec1, rec2, rec3]

from Bio import SeqIO
SeqIO.write(my_records, "my_example.faa", "fasta")

We have two functions for reading in sequence alignments, Bio.AlignIO.read() and Bio.AlignIO.parse() which following the convention introduced in Bio.SeqIO are for files containing one or multiple alignments respectively.¶

Using Bio.AlignIO.parse() will return an iterator which gives MultipleSeqAlignment objects. Iterators are typically used in a for loop. Examples of situations where you will have multiple different alignments include resampled alignments from the PHYLIP tool seqboot, or multiple pairwise alignments from the EMBOSS tools water or needle, or Bill Pearson’s FASTA tools.¶

https://pfam.xfam.org/.¶

Phage_Coat_Gp8 (PF05371)¶

from Bio import AlignIO
alignment = AlignIO.read("PF05371_seed.txt", "stockholm")
print(alignment)

You’ll notice in the above output the sequences have been truncated. We could instead write our own code to format this as we please by iterating over the rows as SeqRecord objects:¶

from Bio import AlignIO
alignment = AlignIO.read("PF05371_seed.txt", "stockholm")
print("Alignment length %i" % alignment.get_alignment_length())

for record in alignment:
    print("%s - %s" % (record.seq, record.id))

handle = open("ProteinFastaResults2K.fasta")
for alignment in AlignIO.parse(handle, "fasta", seq_count=2):
    print("Alignment length %i" % alignment.get_alignment_length())
    for record in alignment:
        print("%s - %s" % (record.seq, record.id))
    print("")

Bio.AlignIO.write() which is for alignment output (writing files). This is a function taking three arguments: some MultipleSeqAlignment objects (or for backwards compatibility the obsolete Alignment objects), a handle or filename to write to, and a sequence format.¶

from Bio.Alphabet import generic_dna
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment

align1 = MultipleSeqAlignment([
             SeqRecord(Seq("ACTGCTAGCTAG", generic_dna), id="Alpha"),
             SeqRecord(Seq("ACT-CTAGCTAG", generic_dna), id="Beta"),
             SeqRecord(Seq("ACTGCTAGDTAG", generic_dna), id="Gamma"),
         ])

align2 = MultipleSeqAlignment([
             SeqRecord(Seq("GTCAGC-AG", generic_dna), id="Delta"),
             SeqRecord(Seq("GACAGCTAG", generic_dna), id="Epsilon"),
             SeqRecord(Seq("GTCAGCTAG", generic_dna), id="Zeta"),
         ])

align3 = MultipleSeqAlignment([
             SeqRecord(Seq("ACTAGTACAGCTG", generic_dna), id="Eta"),
             SeqRecord(Seq("ACTAGTACAGCT-", generic_dna), id="Theta"),
             SeqRecord(Seq("-CTACTACAGGTG", generic_dna), id="Iota"),
         ])

my_alignments = [align1, align2, align3]

from Bio import AlignIO
AlignIO.write(my_alignments, "my_example.phy", "phylip")

#converting formats
from Bio import AlignIO
count = AlignIO.convert("PF05371_seed.txt", "stockholm", "PF05371_seed.aln", "clustal")
print("Converted %i alignments" % count)

from Bio import AlignIO
alignments = AlignIO.parse("PF05371_seed.txt", "stockholm")
count = AlignIO.write(alignments, "PF05371_seed.aln", "clustal")
print("Converted %i alignments" % count)

First of all, in some senses the alignment objects act like a Python list of SeqRecord objects (the rows). With this model in mind hopefully the actions of len() (the number of rows) and iteration (each row as a SeqRecord) make sense:¶

from Bio import AlignIO
alignment = AlignIO.read("PF05371_seed.txt", "stockholm")
print("Number of rows: %i" % len(alignment))
for record in alignment:
    print("%s - %s" % (record.seq, record.id))

print(alignment)

print(alignment[3:7])

What if you wanted to select by column? Those of you who have used the NumPy matrix or array objects won’t be surprised at this - you use a double index.¶

from Bio import AlignIO
alignment = AlignIO.read("PF05371_seed.txt", "stockholm")
print(alignment[5, 6])
print(alignment[5].seq[6])
print(alignment[:, 6])

Depending on what you are doing, it can be more useful to turn the alignment object into an array of letters – and you can do this with NumPy:¶

import numpy as np
from Bio import AlignIO
alignment = AlignIO.read("PF05371_seed.txt", "stockholm")
align_array = np.array([list(rec) for rec in alignment], np.character)
print("Array shape %i by %i" % align_array.shape)

If you will be working heavily with the columns, you can tell NumPy to store the array by column (as in Fortran) rather then its default of by row (as in C):¶

align_array = np.array([list(rec) for rec in alignment], np.character, order="F")

#ClustalW tools 
#PhyloTree
#Muscle
#EMBOSS

#To generate pairwise alignments, se first create a PairwiseAligner object:
from Bio import Align
aligner = Align.PairwiseAligner()

#Use the aligner.score method to calculate the alignment score between two sequences:
seq1 = "GAACT"
seq2 = "GAT"
score = aligner.score(seq1, seq2)
score

#View ALN
alignments = aligner.align(seq1, seq2)
for alignment in alignments:
    print(alignment)

print(aligner)

#The match and mismatch scores are stored as attributes of an PairwiseAligner object:
from Bio import Align
aligner = Align.PairwiseAligner()
print(aligner.match)
print(aligner.mismatch)

score = aligner.score("AAA","AAC")
print(score)

aligner.match = 2.0
score2 = aligner.score("AAA","AAC")
print(score2)

#Iterate over ALN
from Bio import Align
aligner = Align.PairwiseAligner()
alignments = aligner.align("AAA", "AA")
len(alignments)

from Bio import Align
aligner = Align.PairwiseAligner()
alignments = aligner.align("AAA", "AA")
print(alignments[2])