Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
BIOL478-PP2-Extra-Credit--Hailey-Szadowski/PP2_HaileySzadowski.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
227 lines (174 sloc)
7.3 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# PP2 | |
from Bio import SeqIO | |
import numpy as np | |
# read in protein sequences | |
# protein sequences must be pro1.fasta and pro2.fasta | |
recordpro1 = SeqIO.read("pro1.fasta","fasta") # convert to string format | |
pro1seq = str(recordpro1.seq) | |
recordpro2 = SeqIO.read("pro2.fasta","fasta") | |
pro2seq = str(recordpro2.seq) | |
# define number of desired alignments | |
# if no more alignments present, print no alignments found | |
numseq = 4 | |
# read in scoring table, must be BLOSUM62 | |
scoring = np.loadtxt("BLOSUM62",skiprows=1,usecols=range(1,25)) | |
# list of AA in order of table | |
aa = ['A','R','N','D','C','Q','E','G','H','I','L','K','M','F','P','S','T','W','Y','V','B','Z','X','*'] | |
# define score lookup function | |
#a1 = sequence 1, number of rows | |
#a2 = sequence 2, number of columns | |
def scoreLookup(a1,a2): | |
for i in range(24): | |
if a1 == aa[i]: | |
i1 = i | |
if a2 == aa[i]: | |
i2 = i | |
score = scoring[i1,i2] | |
return score | |
# perform local alignment (Smith Waterman) | |
def localAlign(s1,s2): | |
# define dimensions | |
xdim = len(s1) | |
ydim = len(s2) | |
# create scoring matrix | |
alignmatrix = np.zeros((xdim,ydim)) | |
dirmatrix = np.chararray((xdim, ydim), itemsize=2) # matrix used to determine direction of optimal alignment | |
# set up outermost values | |
# row 1 | |
for i in range(ydim): | |
if scoreLookup(s1[0],s2[i]) >= 0: | |
alignmatrix[0,i] = scoreLookup(s1[0],s2[i]) | |
else: | |
alignmatrix[0,i] = 0 | |
dirmatrix[0,i] = 'LL' # define alignment direction | |
# column 1 | |
for i in range(xdim): | |
if scoreLookup(s1[i],s2[0]) >= 0: | |
alignmatrix[i,0] = scoreLookup(s1[i],s2[0]) | |
else: | |
alignmatrix[i,0] = 0 | |
dirmatrix[i,0] = 'UU' # define alignment direction | |
dirmatrix[0,0] = 'S' | |
go = -10 # gap opening penalty | |
ge = -1 # gap extension penalty | |
# default values | |
gap = False | |
gaplength = 0 | |
# iterate through scoring matrix | |
for i in range(1,ydim): | |
for n in range(1,xdim): | |
# list to store all possible scores | |
possiblescores = [0,0,0,0] | |
# match/mismatch | |
possiblescores[0] = alignmatrix[n-1,i-1] + scoreLookup(s1[n],s2[i]) | |
# gap extension | |
if gap == True: | |
possiblescores[1] = alignmatrix[n-1,i] + ge*gaplength | |
possiblescores[2] = alignmatrix[n,i-1] + ge*gaplength | |
# gap opening | |
if gap == False: | |
possiblescores[1] = alignmatrix[n-1,i] + go | |
possiblescores[2] = alignmatrix[n,i-1] + go | |
# update gap length | |
if possiblescores[1] == max(possiblescores) or possiblescores[2] == max(possiblescores): | |
gaplength = gaplength + 1 | |
if possiblescores[1] == max(possiblescores): | |
dirmatrix[n, i] = 'UU' # define alignment direction | |
if possiblescores[2] == max(possiblescores): | |
dirmatrix[n, i] = 'LL' | |
else: | |
gaplength = 0 | |
if possiblescores[0] == max(possiblescores): | |
dirmatrix[n, i] = 'UL' # define alignment direction | |
if possiblescores[3] == max(possiblescores): | |
dirmatrix[n, i] = 'S' | |
alignmatrix[n,i] = max(possiblescores) # set position equal to max value | |
return alignmatrix, dirmatrix | |
# print found alignments | |
def printAlignment(sm, dm, s1, s2, local=True): | |
a1, a2 = [], [] # a1 and a2 store the alignments for s1 and s2 respectively | |
xdim, ydim = sm.shape # find shape of the alignment table | |
if local: # the starting point for backtracking for local alignment is the largest score | |
_, x, y = max((v, x, y) for x, row in enumerate(sm) for y, v in enumerate(row)) # tiebreak by taking larger x values then larger y values | |
for _ in range(xdim - x - 1): # if we start at position x, we have xdim - x + 1 number of characters at the end | |
a1.append('.') | |
for _ in range(ydim - y - 1): | |
a2.append('.') | |
else: # global alignment, start at the end | |
x, y = xdim - 1, ydim - 1 | |
while x >= 0 and y >= 0: # repeatedly backtrack until at start | |
if local: # for a local alignment, once the score reaches 0, the alignment ends | |
if sm[x][y] == 0: | |
break | |
# based on the direction given in the direction matrix, backtrack a;lignment | |
# since starting from the end, prepend by inserting at 0 to maintain ordering of alignment | |
# LL and UU, skip aligning for one of the characters (gap) | |
if dm[x][y] == b'UL': | |
a1.insert(0, s1[x]) | |
a2.insert(0, s2[y]) | |
x, y = x - 1, y - 1 | |
elif dm[x][y] == b'LL': | |
a1.insert(0, '_') | |
a2.insert(0, s2[y]) | |
y = y - 1 | |
elif dm[x][y] == b'UU': | |
a1.insert(0, s1[x]) | |
a2.insert(0, '_') | |
x = x - 1 | |
elif dm[x][y] == b'S': | |
a1.insert(0, s1[x]) | |
a2.insert(0, s2[x]) | |
break | |
# local alignments do not start at the beginning of sequence, pad remaining number | |
# of characters | |
if local: | |
for _ in range(x+1): | |
a1.insert(0, '.') | |
for _ in range(y+1): | |
a2.insert(0, '.') | |
# pad to align if local alignments do not start at the same position | |
if y > x: | |
for _ in range((y - x)): | |
a1.insert(0, ' ') | |
elif y < x: | |
for _ in range((x - y)): | |
a2.insert(0, ' ') | |
# if all of the found alignments are skips, there is no alignment | |
if all(map(lambda x: x == '.' or x == ' ', a1)) and all(map(lambda x: x == '.' or x == ' ', a2)): | |
print("No alignment found") | |
return "", "" | |
# print first alignment | |
print("".join(a1)) | |
# print '|'s between matches | |
for x, y in zip(a1, a2): | |
print("|", end="") if x == y and x != '.' and y != '.' else print(" ", end="") | |
print() | |
# print second alignment | |
print("".join(a2)) | |
return a1, a2 | |
# find non-overlapping non-optimial alignments | |
def nsubopts(n, s1, s2): | |
for _ in range(n+1): # n times! :) | |
if _ == 0: | |
print("Optimal Alignment:\n") | |
elif _ == 1: | |
print() | |
print("Suboptimal Alignments:\n") | |
sm, dm = localAlign(s1, s2) # perform alignment | |
a1, a2 = printAlignment(sm, dm, s1, s2, local=True) # print alignment, store alignment | |
print() | |
if not a1 or not a2: # if no alignment found, end iteration | |
break | |
for x, y in zip(a1, a2): # iterate over the alignment pairs | |
if x == "_" or y == "_" or x == "." or y == "." or x == " " or y == " ": | |
continue | |
for i in range(24): # identify position in scoring array | |
if x == aa[i]: | |
i1 = i | |
if y == aa[i]: | |
i2 = i | |
# score is now 0, as per hint | |
scoring[i1, i2] = 0 | |
scoring[i2, i1] = 0 | |
# find and print alignments | |
nsubopts(numseq, pro1seq, pro2seq) | |