Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
# PP1
from Bio import SeqIO
from Bio.Seq import Seq
import numpy as np
# read in sequences
recorddna = SeqIO.read("dna.fasta","fasta") # convert to string format
dnaseq = str(recorddna.seq)
recordpro = SeqIO.read("pro.fasta","fasta")
proseq = str(recordpro.seq)
# define reading frames
# reading frame 1
dnaseq1 = dnaseq[0:]
frame1untrunc = len(dnaseq1)
# truncate to multiple of 3
if frame1untrunc % 3 == 0:
frame1 = dnaseq1
elif frame1untrunc % 3 == 1:
frame1 = dnaseq1[0:-1]
else:
frame1 = dnaseq1[0:-2]
# reading frame 2
dnaseq2 = dnaseq[1:]
frame2untrunc = len(dnaseq2)
# truncate to multiple of 3
if frame2untrunc % 3 == 0:
frame2 = dnaseq2
elif frame2untrunc % 3 == 1:
frame2 = dnaseq2[0:-1]
else:
frame2 = dnaseq2[0:-2]
len(frame2)
# reading frame 3
dnaseq3 = dnaseq[2:]
frame3untrunc = len(dnaseq3)
# truncate to multiple of 3
if frame3untrunc % 3 == 0:
frame3 = dnaseq3
elif frame3untrunc % 3 == 1:
frame3 = dnaseq3[0:-1]
else:
frame3 = dnaseq3[0:-2]
# translate into protein sequences
frame1temp = Seq(frame1)
frame1pro = str(frame1temp.translate())
frame2temp = Seq(frame2)
frame2pro = str(frame2temp.translate())
frame3temp = Seq(frame3)
frame3pro = str(frame3temp.translate())
frames = [frame1pro,frame2pro,frame3pro] # compile into list
# read in scoring table, must be BLOSUM62
scoring = np.loadtxt("BLOSUM62",skiprows=1,usecols=range(1,25))
# list of AA in order of table
aa = ['A','R','N','D','C','Q','E','G','H','I','L','K','M','F','P','S','T','W','Y','V','B','Z','X','*']
# define score lookup function
#a1 = sequence 1, number of rows
#a2 = sequence 2, number of columns
def scoreLookup(a1,a2):
for i in range(24):
if a1 == aa[i]:
i1 = i
if a2 == aa[i]:
i2 = i
score = scoring[i1,i2]
return score
# perform local alignment (Smith Waterman)
def localAlign(s1,s2):
# define dimensions
xdim = len(s1)
ydim = len(s2)
# create scoring matrix
alignmatrix = np.zeros((xdim,ydim))
dirmatrix = np.chararray((xdim, ydim), itemsize=2) # matrix used to determine direction of optimal alignment
# set up outermost values
# row 1
for i in range(ydim):
if scoreLookup(s1[0],s2[i]) >= 0:
alignmatrix[0,i] = scoreLookup(s1[0],s2[i])
else:
alignmatrix[0,i] = 0
dirmatrix[0,i] = 'LL' # define alignment direction
# column 1
for i in range(xdim):
if scoreLookup(s1[i],s2[0]) >= 0:
alignmatrix[i,0] = scoreLookup(s1[i],s2[0])
else:
alignmatrix[i,0] = 0
dirmatrix[i,0] = 'UU' # define alignment direction
dirmatrix[0,0] = 'S' # start point
go = -10 # gap opening penalty
ge = -1 # gap extension penalty
# default values
gap = False
gaplength = 0
# iterate through scoring matrix
for i in range(1,ydim):
for n in range(1,xdim):
# list to store all possible scores
possiblescores = [0,0,0,0]
# match/mismatch
possiblescores[0] = alignmatrix[n-1,i-1] + scoreLookup(s1[n],s2[i])
# gap extension
if gap == True:
possiblescores[1] = alignmatrix[n-1,i] + ge*gaplength
possiblescores[2] = alignmatrix[n,i-1] + ge*gaplength
# gap opening
if gap == False:
possiblescores[1] = alignmatrix[n-1,i] + go
possiblescores[2] = alignmatrix[n,i-1] + go
# update gap length
if possiblescores[1] == max(possiblescores) or possiblescores[2] == max(possiblescores):
gaplength = gaplength + 1
if possiblescores[1] == max(possiblescores):
dirmatrix[n, i] = 'UU' # define alignment direction
if possiblescores[2] == max(possiblescores):
dirmatrix[n, i] = 'LL'
else:
gaplength = 0
if possiblescores[0] == max(possiblescores):
dirmatrix[n, i] = 'UL' # define alignment direction
if possiblescores[3] == max(possiblescores):
dirmatrix[n, i] = 'S'
alignmatrix[n,i] = max(possiblescores) # set position equal to max value
return alignmatrix, dirmatrix
# perform global alignment (Needleman Wunsch)
def globalAlign(s1,s2):
# define dimensions
xdim = len(s1)
ydim = len(s2)
# create storing matrix
alignmatrix = np.zeros((xdim,ydim))
dirmatrix = np.chararray((xdim, ydim), itemsize=2) # matrix used to determine direction of optimal alignment
# set up outermost values
# row 1
for i in range(ydim):
if scoreLookup(s1[0],s2[i]) >= 0:
alignmatrix[0,i] = scoreLookup(s1[0],s2[i])
else:
alignmatrix[0,i] = 0
dirmatrix[0,i] = 'LL' # define alignment direction
# column 1
for i in range(xdim):
if scoreLookup(s1[i],s2[0]) >= 0:
alignmatrix[i,0] = scoreLookup(s1[i],s2[0])
else:
alignmatrix[i,0] = 0
dirmatrix[i,0] = 'UU' # define alignment direction
dirmatrix[0, 0] = 'S'
go = -10 # gap opening penalty
ge = -1 # gap extension penalty
# default values
gap = False
gaplength = 0
# iterate through scoring matrix
for i in range(1,ydim):
for n in range(1,xdim):
# list to store all possible scores
possiblescores = [0,0,0]
# match/mismatch
possiblescores[0] = alignmatrix[n-1,i-1] + scoreLookup(s1[n],s2[i])
# gap extension
if gap == True:
possiblescores[1] = alignmatrix[n-1,i] + ge*gaplength
possiblescores[2] = alignmatrix[n,i-1] + ge*gaplength
# gap opening
if gap == False:
possiblescores[1] = alignmatrix[n-1,i] + go
possiblescores[2] = alignmatrix[n,i-1] + go
# update gap length
if possiblescores[1] == max(possiblescores) or possiblescores[2] == max(possiblescores):
gaplength = gaplength + 1
if possiblescores[1] == max(possiblescores):
dirmatrix[n, i] = 'UU' # define alignment direction
if possiblescores[2] == max(possiblescores):
dirmatrix[n, i] = 'LL'
else:
gaplength = 0
if possiblescores[0] == max(possiblescores):
dirmatrix[n, i] = 'UL'# define alignment direction
alignmatrix[n,i] = max(possiblescores) # set position equal to max value
return alignmatrix, dirmatrix
# find best alignment and print it in desired format
def printAlignment(sm, dm, s1, s2, local=True):
a1, a2 = [], [] # a1 and a2 store the alignments for s1 and s2 respectively
xdim, ydim = sm.shape # find shape of the alignment table
if local: # the starting point for backtracking for local alignment is the largest score
_, x, y = max((v, x, y) for x, row in enumerate(sm) for y, v in enumerate(row)) # tiebreak by taking larger x values then larger y values
for _ in range(xdim - x - 1): # if we start at position x, we have xdim - x + 1 number of characters at the end
a1.append('.')
for _ in range(ydim - y - 1):
a2.append('.')
else: # global alignment, start at the end (bottom right corner)
x, y = xdim - 1, ydim - 1
while x >= 0 and y >= 0: # repeatedly backtrack until at start
if local: # for a local alignment, once the score reaches 0, the alignment ends
if sm[x][y] == 0:
break
# based on the direction given in the direction matrix, backtrack alignment
# since starting from the end, prepend by inserting at 0 to maintain ordering of alignment
# LL and UU, skip aligning for one of the characters (gap)
if dm[x][y] == b'UL':
a1.insert(0, s1[x])
a2.insert(0, s2[y])
x, y = x - 1, y - 1
elif dm[x][y] == b'LL':
a1.insert(0, '_')
a2.insert(0, s2[y])
y = y - 1
elif dm[x][y] == b'UU':
a1.insert(0, s1[x])
a2.insert(0, '_')
x = x - 1
elif dm[x][y] == b'S':
a1.insert(0, s1[x])
a2.insert(0, s2[x])
break
# local alignments do not start at the beginning of sequence, pad remaining number
# of characters
if local:
for _ in range(x):
a1.insert(0, '.')
for _ in range(y):
a2.insert(0, '.')
# pad to align if local alignments do not start at the same position
if y > x:
for _ in range((y - x)):
a1.insert(0, ' ')
elif y < x:
for _ in range((x - y)):
a2.insert(0, ' ')
# if all of the found alignments are skips, there is no alignment
if all(map(lambda x: x == '.' or x == ' ', a1)) and all(map(lambda x: x == '.' or x == ' ', a2)):
print("No alignment found")
return
# print first alignment
print("".join(a1))
# print '|'s between matches
for x, y in zip(a1, a2):
print("|", end="") if x == y and x != '.' and y != '.' else print(" ", end="")
print()
# print second alignment
print("".join(a2))
# local alignment between DNA/protein (chose not to do in loop in fear of breaking code)
frame1loc = localAlign(proseq,frames[0])
frame2loc = localAlign(proseq,frames[1])
frame3loc = localAlign(proseq,frames[2])
# global alignment between DNA/protein
frame1glo = globalAlign(proseq,frames[0])
frame2glo = globalAlign(proseq,frames[1])
frame3glo = globalAlign(proseq,frames[2])
# frame 1 local
frame12loc = localAlign(frames[0],frames[1])
frame13loc = localAlign(frames[0],frames[2])
# frame 1 global
frame12glo = globalAlign(frames[0],frames[1])
frame13glo = globalAlign(frames[0],frames[2])
# frame 2 local
frame21loc = localAlign(frames[1],frames[0])
frame23loc = localAlign(frames[1],frames[2])
# frame 2 global
frame21glo = globalAlign(frames[1],frames[0])
frame23glo = globalAlign(frames[1],frames[2])
# frame 3 local
frame31loc = localAlign(frames[2],frames[0])
frame32loc = localAlign(frames[2],frames[1])
# frame 3 global
frame31glo = globalAlign(frames[2],frames[0])
frame32glo = globalAlign(frames[2],frames[1])
# print results
print("LOCAL ALIGNMENT BETWEEN PROTEIN/DNA SEQUENCE")
print()
print("Frame 1")
printAlignment(*frame1loc,proseq,frames[0],local=True)
print()
print("Frame 2")
printAlignment(*frame2loc,proseq,frames[1],local=True)
print()
print("Frame 3")
printAlignment(*frame3loc,proseq,frames[2],local=True)
print()
print("GLOBAL ALIGNMENT BETWEEN PROTEIN/DNA SEQUENCE")
print()
print("Frame 1")
printAlignment(*frame1glo,proseq,frames[0],local=False)
print()
print("Frame 2")
printAlignment(*frame2glo,proseq,frames[1],local=False)
print()
print("Frame 3")
printAlignment(*frame3glo,proseq,frames[2],local=False)
print()
print("LOCAL ALIGNMENT WITH READING FRAME 1")
print()
print("Frame 2")
printAlignment(*frame12loc,frames[0],frames[1],local=True)
print()
print("Frame 3")
printAlignment(*frame13loc,frames[0],frames[2],local=True)
print()
print("GLOBAL ALIGNMENT WITH READING FRAME 1")
print()
print("Frame 2")
printAlignment(*frame12glo,frames[0],frames[1],local=False)
print()
print("Frame 3")
printAlignment(*frame13glo,frames[0],frames[2],local=False)
print()
print("LOCAL ALIGNMENT WITH READING FRAME 2")
print()
print("Frame 1")
printAlignment(*frame21loc,frames[1],frames[0],local=True)
print()
print("Frame 3")
printAlignment(*frame23loc,frames[1],frames[2],local=True)
print()
print("GLOBAL ALIGNMENT WITH READING FRAME 2")
print()
print("Frame 1")
printAlignment(*frame21glo,frames[1],frames[0],local=False)
print()
print("Frame 3")
printAlignment(*frame23glo,frames[1],frames[2],local=False)
print()
print("LOCAL ALIGNMENT WITH READING FRAME 3")
print()
print("Frame 1")
printAlignment(*frame31loc,frames[0],frames[2],local=True)
print()
print("Frame 2")
printAlignment(*frame32loc,frames[2],frames[1],local=True)
print()
print("GLOBAL ALIGNMENT WITH READING FRAME 3")
print()
print("Frame 1")
printAlignment(*frame31glo,frames[2],frames[0],local=False)
print()
print("Frame 2")
printAlignment(*frame32glo,frames[2],frames[1],local=False)
print()