#Seq Manipulation
import Bio as Bio
from Bio.Seq import Seq
my_seq = Seq("AGTACACTGGT")
#my_seq
#print(my_seq)
my_seq.complement()
my_seq.reverse_complement()
# FASTA Parsing
from Bio import SeqIO
for seq_record in SeqIO.parse("ProteinFastaResults2K.fasta", "fasta"):
print(seq_record.id)
print(repr(seq_record.seq))
print(len(seq_record))
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
my_seq = Seq("GATCG", IUPAC.unambiguous_dna)
for index, letter in enumerate(my_seq):
print("%i %s" % (index, letter))
#You can access elements of the sequence in the same way as for strings (but remember, Python counts from zero!):
print(my_seq[0]) #first letter
print(my_seq[2]) #third letter
print(my_seq[-1]) #last letter
from Bio.Seq import Seq
print("AAAA".count("AA"))
print(Seq("AAAA").count("AA"))
#DNA Perc
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC", IUPAC.unambiguous_dna)
print(len(my_seq))
print(my_seq.count("G"))
print(100 * float(my_seq.count("G") + my_seq.count("C")) / len(my_seq))
#Or
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqUtils import GC
my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC", IUPAC.unambiguous_dna)
GC(my_seq)
#Seq as String
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC", IUPAC.unambiguous_dna)
print(my_seq[4:12])
print(my_seq[0::3])#First codon position
print(my_seq[1::3])#Second Codon Position
print(my_seq[2::3])#Third Codon Position
print(my_seq[::-1])#Reverse String
#Seq into Fasta
fasta_format_string = ">Name\n%s\n" % my_seq
print(fasta_format_string)
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
dna_seq = Seq("acgtACGT", generic_dna)
print(dna_seq)
print(dna_seq.upper())
print(dna_seq.lower())
#Transcription
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", IUPAC.unambiguous_dna)
print(coding_dna)
template_dna = coding_dna.reverse_complement()
print(template_dna)
print(coding_dna)
messenger_rna = coding_dna.transcribe()
print(messenger_rna)
#As you can see, all this does is switch T → U, and adjust the alphabet.
#If you do want to do a true biological transcription starting with the template strand, then this becomes a two-step process:
print(template_dna.reverse_complement().transcribe())
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG", IUPAC.unambiguous_rna)
print(messenger_rna)
print(messenger_rna.back_transcribe())
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG", IUPAC.unambiguous_rna)
print(messenger_rna)
print(messenger_rna.translate())
#You can also translate directly from the coding strand DNA sequence:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", IUPAC.unambiguous_dna)
print(coding_dna)
print(coding_dna.translate())
#Seq comparison
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
seq1 = Seq("ACGT", IUPAC.unambiguous_dna)
seq2 = Seq("ACGT", IUPAC.ambiguous_dna)
print(str(seq1) == str(seq2))
print(str(seq1) == str(seq1))
#In general Bio.SeqIO.parse() is used to read in sequence files as SeqRecord objects, and is typically used with a for loop like this:
from Bio import SeqIO
for seq_record in SeqIO.parse("ProteinFastaResults2K.fasta", "fasta"):
print(seq_record.id)
print(repr(seq_record.seq))
print(len(seq_record))
from Bio import SeqIO
identifiers = [seq_record.id for seq_record in SeqIO.parse("ProteinFastaResults2K.fasta", "fasta")]
identifiers
from Bio import SeqIO
record_iterator = SeqIO.parse("ProteinFastaResults2K.fasta", "fasta")
first_record = next(record_iterator)
print(first_record.id)
print(first_record.description)
second_record = next(record_iterator)
print(second_record.id)
print(second_record.description)
from Bio import SeqIO
records = list(SeqIO.parse("ProteinFastaResults2K.fasta", "fasta"))
print("Found %i records" % len(records))
print("The last record")
last_record = records[-1] #using Python's list tricks
print(last_record.id)
print(repr(last_record.seq))
print(len(last_record))
print("The first record")
first_record = records[0] #remember, Python counts from zero
print(first_record.id)
print(repr(first_record.seq))
print(len(first_record))
#Stored Annotations
from Bio import SeqIO
record_iterator = SeqIO.parse("ProteinFastaResults2K.fasta", "fasta")
first_record = next(record_iterator)
print(first_record)
#calculates the total length of the sequences
from Bio import SeqIO
print(sum(len(r) for r in SeqIO.parse("ProteinFastaResults2K.fasta", "fasta")))
#Here we use a file handle instead, using the with statement to close the handle automatically:
from Bio import SeqIO
with open("ProteinFastaResults2K.fasta") as handle:
print(sum(len(r) for r in SeqIO.parse(handle, "fasta")))
#from Bio import SeqIO
handle = open("ProteinFastaResults2K.fasta")
print(sum(len(r) for r in SeqIO.parse(handle, "fasta")))
handle.close()
#Writing Fasta
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_protein
rec1 = SeqRecord(Seq("MMYQQGCFAGGTVLRLAKDLAENNRGARVLVVCSEITAVTFRGPSETHLDSMVGQALFGD" \
+"GAGAVIVGSDPDLSVERPLYELVWTGATLLPDSEGAIDGHLREVGLTFHLLKDVPGLISK" \
+"NIEKSLKEAFTPLGISDWNSTFWIAHPGGPAILDQVEAKLGLKEEKMRATREVLSEYGNM" \
+"SSAC", generic_protein),
id="gi|14150838|gb|AAK54648.1|AF376133_1",
description="chalcone synthase [Cucumis sativus]")
rec2 = SeqRecord(Seq("YPDYYFRITNREHKAELKEKFQRMCDKSMIKKRYMYLTEEILKENPSMCEYMAPSLDARQ" \
+"DMVVVEIPKLGKEAAVKAIKEWGQ", generic_protein),
id="gi|13919613|gb|AAK33142.1|",
description="chalcone synthase [Fragaria vesca subsp. bracteata]")
rec3 = SeqRecord(Seq("MVTVEEFRRAQCAEGPATVMAIGTATPSNCVDQSTYPDYYFRITNSEHKVELKEKFKRMC" \
+"EKSMIKKRYMHLTEEILKENPNICAYMAPSLDARQDIVVVEVPKLGKEAAQKAIKEWGQP" \
+"KSKITHLVFCTTSGVDMPGCDYQLTKLLGLRPSVKRFMMYQQGCFAGGTVLRMAKDLAEN" \
+"NKGARVLVVCSEITAVTFRGPNDTHLDSLVGQALFGDGAAAVIIGSDPIPEVERPLFELV" \
+"SAAQTLLPDSEGAIDGHLREVGLTFHLLKDVPGLISKNIEKSLVEAFQPLGISDWNSLFW" \
+"IAHPGGPAILDQVELKLGLKQEKLKATRKVLSNYGNMSSACVLFILDEMRKASAKEGLGT" \
+"TGEGLEWGVLFGFGPGLTVETVVLHSVAT", generic_protein),
id="gi|13925890|gb|AAK49457.1|",
description="chalcone synthase [Nicotiana tabacum]")
my_records = [rec1, rec2, rec3]
from Bio import SeqIO
SeqIO.write(my_records, "my_example.faa", "fasta")
from Bio import AlignIO
alignment = AlignIO.read("PF05371_seed.txt", "stockholm")
print(alignment)
from Bio import AlignIO
alignment = AlignIO.read("PF05371_seed.txt", "stockholm")
print("Alignment length %i" % alignment.get_alignment_length())
for record in alignment:
print("%s - %s" % (record.seq, record.id))
handle = open("ProteinFastaResults2K.fasta")
for alignment in AlignIO.parse(handle, "fasta", seq_count=2):
print("Alignment length %i" % alignment.get_alignment_length())
for record in alignment:
print("%s - %s" % (record.seq, record.id))
print("")
from Bio.Alphabet import generic_dna
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
align1 = MultipleSeqAlignment([
SeqRecord(Seq("ACTGCTAGCTAG", generic_dna), id="Alpha"),
SeqRecord(Seq("ACT-CTAGCTAG", generic_dna), id="Beta"),
SeqRecord(Seq("ACTGCTAGDTAG", generic_dna), id="Gamma"),
])
align2 = MultipleSeqAlignment([
SeqRecord(Seq("GTCAGC-AG", generic_dna), id="Delta"),
SeqRecord(Seq("GACAGCTAG", generic_dna), id="Epsilon"),
SeqRecord(Seq("GTCAGCTAG", generic_dna), id="Zeta"),
])
align3 = MultipleSeqAlignment([
SeqRecord(Seq("ACTAGTACAGCTG", generic_dna), id="Eta"),
SeqRecord(Seq("ACTAGTACAGCT-", generic_dna), id="Theta"),
SeqRecord(Seq("-CTACTACAGGTG", generic_dna), id="Iota"),
])
my_alignments = [align1, align2, align3]
from Bio import AlignIO
AlignIO.write(my_alignments, "my_example.phy", "phylip")
#converting formats
from Bio import AlignIO
count = AlignIO.convert("PF05371_seed.txt", "stockholm", "PF05371_seed.aln", "clustal")
print("Converted %i alignments" % count)
from Bio import AlignIO
alignments = AlignIO.parse("PF05371_seed.txt", "stockholm")
count = AlignIO.write(alignments, "PF05371_seed.aln", "clustal")
print("Converted %i alignments" % count)
from Bio import AlignIO
alignment = AlignIO.read("PF05371_seed.txt", "stockholm")
print("Number of rows: %i" % len(alignment))
for record in alignment:
print("%s - %s" % (record.seq, record.id))
print(alignment)
print(alignment[3:7])
from Bio import AlignIO
alignment = AlignIO.read("PF05371_seed.txt", "stockholm")
print(alignment[5, 6])
print(alignment[5].seq[6])
print(alignment[:, 6])
import numpy as np
from Bio import AlignIO
alignment = AlignIO.read("PF05371_seed.txt", "stockholm")
align_array = np.array([list(rec) for rec in alignment], np.character)
print("Array shape %i by %i" % align_array.shape)
align_array = np.array([list(rec) for rec in alignment], np.character, order="F")
#ClustalW tools
#PhyloTree
#Muscle
#EMBOSS
#To generate pairwise alignments, se first create a PairwiseAligner object:
from Bio import Align
aligner = Align.PairwiseAligner()
#Use the aligner.score method to calculate the alignment score between two sequences:
seq1 = "GAACT"
seq2 = "GAT"
score = aligner.score(seq1, seq2)
score
#View ALN
alignments = aligner.align(seq1, seq2)
for alignment in alignments:
print(alignment)
print(aligner)
#The match and mismatch scores are stored as attributes of an PairwiseAligner object:
from Bio import Align
aligner = Align.PairwiseAligner()
print(aligner.match)
print(aligner.mismatch)
score = aligner.score("AAA","AAC")
print(score)
aligner.match = 2.0
score2 = aligner.score("AAA","AAC")
print(score2)
#Iterate over ALN
from Bio import Align
aligner = Align.PairwiseAligner()
alignments = aligner.align("AAA", "AA")
len(alignments)
from Bio import Align
aligner = Align.PairwiseAligner()
alignments = aligner.align("AAA", "AA")
print(alignments[2])