Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
# PP2
from Bio import SeqIO
import numpy as np
# read in protein sequences
# protein sequences must be pro1.fasta and pro2.fasta
recordpro1 = SeqIO.read("pro1.fasta","fasta") # convert to string format
pro1seq = str(recordpro1.seq)
recordpro2 = SeqIO.read("pro2.fasta","fasta")
pro2seq = str(recordpro2.seq)
# define number of desired alignments
# if no more alignments present, print no alignments found
numseq = 4
# read in scoring table, must be BLOSUM62
scoring = np.loadtxt("BLOSUM62",skiprows=1,usecols=range(1,25))
# list of AA in order of table
aa = ['A','R','N','D','C','Q','E','G','H','I','L','K','M','F','P','S','T','W','Y','V','B','Z','X','*']
# define score lookup function
#a1 = sequence 1, number of rows
#a2 = sequence 2, number of columns
def scoreLookup(a1,a2):
for i in range(24):
if a1 == aa[i]:
i1 = i
if a2 == aa[i]:
i2 = i
score = scoring[i1,i2]
return score
# perform local alignment (Smith Waterman)
def localAlign(s1,s2):
# define dimensions
xdim = len(s1)
ydim = len(s2)
# create scoring matrix
alignmatrix = np.zeros((xdim,ydim))
dirmatrix = np.chararray((xdim, ydim), itemsize=2) # matrix used to determine direction of optimal alignment
# set up outermost values
# row 1
for i in range(ydim):
if scoreLookup(s1[0],s2[i]) >= 0:
alignmatrix[0,i] = scoreLookup(s1[0],s2[i])
else:
alignmatrix[0,i] = 0
dirmatrix[0,i] = 'LL' # define alignment direction
# column 1
for i in range(xdim):
if scoreLookup(s1[i],s2[0]) >= 0:
alignmatrix[i,0] = scoreLookup(s1[i],s2[0])
else:
alignmatrix[i,0] = 0
dirmatrix[i,0] = 'UU' # define alignment direction
dirmatrix[0,0] = 'S'
go = -10 # gap opening penalty
ge = -1 # gap extension penalty
# default values
gap = False
gaplength = 0
# iterate through scoring matrix
for i in range(1,ydim):
for n in range(1,xdim):
# list to store all possible scores
possiblescores = [0,0,0,0]
# match/mismatch
possiblescores[0] = alignmatrix[n-1,i-1] + scoreLookup(s1[n],s2[i])
# gap extension
if gap == True:
possiblescores[1] = alignmatrix[n-1,i] + ge*gaplength
possiblescores[2] = alignmatrix[n,i-1] + ge*gaplength
# gap opening
if gap == False:
possiblescores[1] = alignmatrix[n-1,i] + go
possiblescores[2] = alignmatrix[n,i-1] + go
# update gap length
if possiblescores[1] == max(possiblescores) or possiblescores[2] == max(possiblescores):
gaplength = gaplength + 1
if possiblescores[1] == max(possiblescores):
dirmatrix[n, i] = 'UU' # define alignment direction
if possiblescores[2] == max(possiblescores):
dirmatrix[n, i] = 'LL'
else:
gaplength = 0
if possiblescores[0] == max(possiblescores):
dirmatrix[n, i] = 'UL' # define alignment direction
if possiblescores[3] == max(possiblescores):
dirmatrix[n, i] = 'S'
alignmatrix[n,i] = max(possiblescores) # set position equal to max value
return alignmatrix, dirmatrix
# print found alignments
def printAlignment(sm, dm, s1, s2, local=True):
a1, a2 = [], [] # a1 and a2 store the alignments for s1 and s2 respectively
xdim, ydim = sm.shape # find shape of the alignment table
if local: # the starting point for backtracking for local alignment is the largest score
_, x, y = max((v, x, y) for x, row in enumerate(sm) for y, v in enumerate(row)) # tiebreak by taking larger x values then larger y values
for _ in range(xdim - x - 1): # if we start at position x, we have xdim - x + 1 number of characters at the end
a1.append('.')
for _ in range(ydim - y - 1):
a2.append('.')
else: # global alignment, start at the end
x, y = xdim - 1, ydim - 1
while x >= 0 and y >= 0: # repeatedly backtrack until at start
if local: # for a local alignment, once the score reaches 0, the alignment ends
if sm[x][y] == 0:
break
# based on the direction given in the direction matrix, backtrack a;lignment
# since starting from the end, prepend by inserting at 0 to maintain ordering of alignment
# LL and UU, skip aligning for one of the characters (gap)
if dm[x][y] == b'UL':
a1.insert(0, s1[x])
a2.insert(0, s2[y])
x, y = x - 1, y - 1
elif dm[x][y] == b'LL':
a1.insert(0, '_')
a2.insert(0, s2[y])
y = y - 1
elif dm[x][y] == b'UU':
a1.insert(0, s1[x])
a2.insert(0, '_')
x = x - 1
elif dm[x][y] == b'S':
a1.insert(0, s1[x])
a2.insert(0, s2[x])
break
# local alignments do not start at the beginning of sequence, pad remaining number
# of characters
if local:
for _ in range(x+1):
a1.insert(0, '.')
for _ in range(y+1):
a2.insert(0, '.')
# pad to align if local alignments do not start at the same position
if y > x:
for _ in range((y - x)):
a1.insert(0, ' ')
elif y < x:
for _ in range((x - y)):
a2.insert(0, ' ')
# if all of the found alignments are skips, there is no alignment
if all(map(lambda x: x == '.' or x == ' ', a1)) and all(map(lambda x: x == '.' or x == ' ', a2)):
print("No alignment found")
return "", ""
# print first alignment
print("".join(a1))
# print '|'s between matches
for x, y in zip(a1, a2):
print("|", end="") if x == y and x != '.' and y != '.' else print(" ", end="")
print()
# print second alignment
print("".join(a2))
return a1, a2
# find non-overlapping non-optimial alignments
def nsubopts(n, s1, s2):
for _ in range(n+1): # n times! :)
if _ == 0:
print("Optimal Alignment:\n")
elif _ == 1:
print()
print("Suboptimal Alignments:\n")
sm, dm = localAlign(s1, s2) # perform alignment
a1, a2 = printAlignment(sm, dm, s1, s2, local=True) # print alignment, store alignment
print()
if not a1 or not a2: # if no alignment found, end iteration
break
for x, y in zip(a1, a2): # iterate over the alignment pairs
if x == "_" or y == "_" or x == "." or y == "." or x == " " or y == " ":
continue
for i in range(24): # identify position in scoring array
if x == aa[i]:
i1 = i
if y == aa[i]:
i2 = i
# score is now 0, as per hint
scoring[i1, i2] = 0
scoring[i2, i1] = 0
# find and print alignments
nsubopts(numseq, pro1seq, pro2seq)