Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
BIOL478-PP1-Extra-Credit---Hailey-Szadowski/PP1_HaileySzadowski.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
435 lines (338 sloc)
12.5 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# PP1 | |
from Bio import SeqIO | |
from Bio.Seq import Seq | |
import numpy as np | |
# read in sequences | |
recorddna = SeqIO.read("dna.fasta","fasta") # convert to string format | |
dnaseq = str(recorddna.seq) | |
recordpro = SeqIO.read("pro.fasta","fasta") | |
proseq = str(recordpro.seq) | |
# define reading frames | |
# reading frame 1 | |
dnaseq1 = dnaseq[0:] | |
frame1untrunc = len(dnaseq1) | |
# truncate to multiple of 3 | |
if frame1untrunc % 3 == 0: | |
frame1 = dnaseq1 | |
elif frame1untrunc % 3 == 1: | |
frame1 = dnaseq1[0:-1] | |
else: | |
frame1 = dnaseq1[0:-2] | |
# reading frame 2 | |
dnaseq2 = dnaseq[1:] | |
frame2untrunc = len(dnaseq2) | |
# truncate to multiple of 3 | |
if frame2untrunc % 3 == 0: | |
frame2 = dnaseq2 | |
elif frame2untrunc % 3 == 1: | |
frame2 = dnaseq2[0:-1] | |
else: | |
frame2 = dnaseq2[0:-2] | |
len(frame2) | |
# reading frame 3 | |
dnaseq3 = dnaseq[2:] | |
frame3untrunc = len(dnaseq3) | |
# truncate to multiple of 3 | |
if frame3untrunc % 3 == 0: | |
frame3 = dnaseq3 | |
elif frame3untrunc % 3 == 1: | |
frame3 = dnaseq3[0:-1] | |
else: | |
frame3 = dnaseq3[0:-2] | |
# translate into protein sequences | |
frame1temp = Seq(frame1) | |
frame1pro = str(frame1temp.translate()) | |
frame2temp = Seq(frame2) | |
frame2pro = str(frame2temp.translate()) | |
frame3temp = Seq(frame3) | |
frame3pro = str(frame3temp.translate()) | |
frames = [frame1pro,frame2pro,frame3pro] # compile into list | |
# read in scoring table, must be BLOSUM62 | |
scoring = np.loadtxt("BLOSUM62",skiprows=1,usecols=range(1,25)) | |
# list of AA in order of table | |
aa = ['A','R','N','D','C','Q','E','G','H','I','L','K','M','F','P','S','T','W','Y','V','B','Z','X','*'] | |
# define score lookup function | |
#a1 = sequence 1, number of rows | |
#a2 = sequence 2, number of columns | |
def scoreLookup(a1,a2): | |
for i in range(24): | |
if a1 == aa[i]: | |
i1 = i | |
if a2 == aa[i]: | |
i2 = i | |
score = scoring[i1,i2] | |
return score | |
# perform local alignment (Smith Waterman) | |
def localAlign(s1,s2): | |
# define dimensions | |
xdim = len(s1) | |
ydim = len(s2) | |
# create scoring matrix | |
alignmatrix = np.zeros((xdim,ydim)) | |
dirmatrix = np.chararray((xdim, ydim), itemsize=2) # matrix used to determine direction of optimal alignment | |
# set up outermost values | |
# row 1 | |
for i in range(ydim): | |
if scoreLookup(s1[0],s2[i]) >= 0: | |
alignmatrix[0,i] = scoreLookup(s1[0],s2[i]) | |
else: | |
alignmatrix[0,i] = 0 | |
dirmatrix[0,i] = 'LL' # define alignment direction | |
# column 1 | |
for i in range(xdim): | |
if scoreLookup(s1[i],s2[0]) >= 0: | |
alignmatrix[i,0] = scoreLookup(s1[i],s2[0]) | |
else: | |
alignmatrix[i,0] = 0 | |
dirmatrix[i,0] = 'UU' # define alignment direction | |
dirmatrix[0,0] = 'S' # start point | |
go = -10 # gap opening penalty | |
ge = -1 # gap extension penalty | |
# default values | |
gap = False | |
gaplength = 0 | |
# iterate through scoring matrix | |
for i in range(1,ydim): | |
for n in range(1,xdim): | |
# list to store all possible scores | |
possiblescores = [0,0,0,0] | |
# match/mismatch | |
possiblescores[0] = alignmatrix[n-1,i-1] + scoreLookup(s1[n],s2[i]) | |
# gap extension | |
if gap == True: | |
possiblescores[1] = alignmatrix[n-1,i] + ge*gaplength | |
possiblescores[2] = alignmatrix[n,i-1] + ge*gaplength | |
# gap opening | |
if gap == False: | |
possiblescores[1] = alignmatrix[n-1,i] + go | |
possiblescores[2] = alignmatrix[n,i-1] + go | |
# update gap length | |
if possiblescores[1] == max(possiblescores) or possiblescores[2] == max(possiblescores): | |
gaplength = gaplength + 1 | |
if possiblescores[1] == max(possiblescores): | |
dirmatrix[n, i] = 'UU' # define alignment direction | |
if possiblescores[2] == max(possiblescores): | |
dirmatrix[n, i] = 'LL' | |
else: | |
gaplength = 0 | |
if possiblescores[0] == max(possiblescores): | |
dirmatrix[n, i] = 'UL' # define alignment direction | |
if possiblescores[3] == max(possiblescores): | |
dirmatrix[n, i] = 'S' | |
alignmatrix[n,i] = max(possiblescores) # set position equal to max value | |
return alignmatrix, dirmatrix | |
# perform global alignment (Needleman Wunsch) | |
def globalAlign(s1,s2): | |
# define dimensions | |
xdim = len(s1) | |
ydim = len(s2) | |
# create storing matrix | |
alignmatrix = np.zeros((xdim,ydim)) | |
dirmatrix = np.chararray((xdim, ydim), itemsize=2) # matrix used to determine direction of optimal alignment | |
# set up outermost values | |
# row 1 | |
for i in range(ydim): | |
if scoreLookup(s1[0],s2[i]) >= 0: | |
alignmatrix[0,i] = scoreLookup(s1[0],s2[i]) | |
else: | |
alignmatrix[0,i] = 0 | |
dirmatrix[0,i] = 'LL' # define alignment direction | |
# column 1 | |
for i in range(xdim): | |
if scoreLookup(s1[i],s2[0]) >= 0: | |
alignmatrix[i,0] = scoreLookup(s1[i],s2[0]) | |
else: | |
alignmatrix[i,0] = 0 | |
dirmatrix[i,0] = 'UU' # define alignment direction | |
dirmatrix[0, 0] = 'S' | |
go = -10 # gap opening penalty | |
ge = -1 # gap extension penalty | |
# default values | |
gap = False | |
gaplength = 0 | |
# iterate through scoring matrix | |
for i in range(1,ydim): | |
for n in range(1,xdim): | |
# list to store all possible scores | |
possiblescores = [0,0,0] | |
# match/mismatch | |
possiblescores[0] = alignmatrix[n-1,i-1] + scoreLookup(s1[n],s2[i]) | |
# gap extension | |
if gap == True: | |
possiblescores[1] = alignmatrix[n-1,i] + ge*gaplength | |
possiblescores[2] = alignmatrix[n,i-1] + ge*gaplength | |
# gap opening | |
if gap == False: | |
possiblescores[1] = alignmatrix[n-1,i] + go | |
possiblescores[2] = alignmatrix[n,i-1] + go | |
# update gap length | |
if possiblescores[1] == max(possiblescores) or possiblescores[2] == max(possiblescores): | |
gaplength = gaplength + 1 | |
if possiblescores[1] == max(possiblescores): | |
dirmatrix[n, i] = 'UU' # define alignment direction | |
if possiblescores[2] == max(possiblescores): | |
dirmatrix[n, i] = 'LL' | |
else: | |
gaplength = 0 | |
if possiblescores[0] == max(possiblescores): | |
dirmatrix[n, i] = 'UL'# define alignment direction | |
alignmatrix[n,i] = max(possiblescores) # set position equal to max value | |
return alignmatrix, dirmatrix | |
# find best alignment and print it in desired format | |
def printAlignment(sm, dm, s1, s2, local=True): | |
a1, a2 = [], [] # a1 and a2 store the alignments for s1 and s2 respectively | |
xdim, ydim = sm.shape # find shape of the alignment table | |
if local: # the starting point for backtracking for local alignment is the largest score | |
_, x, y = max((v, x, y) for x, row in enumerate(sm) for y, v in enumerate(row)) # tiebreak by taking larger x values then larger y values | |
for _ in range(xdim - x - 1): # if we start at position x, we have xdim - x + 1 number of characters at the end | |
a1.append('.') | |
for _ in range(ydim - y - 1): | |
a2.append('.') | |
else: # global alignment, start at the end (bottom right corner) | |
x, y = xdim - 1, ydim - 1 | |
while x >= 0 and y >= 0: # repeatedly backtrack until at start | |
if local: # for a local alignment, once the score reaches 0, the alignment ends | |
if sm[x][y] == 0: | |
break | |
# based on the direction given in the direction matrix, backtrack alignment | |
# since starting from the end, prepend by inserting at 0 to maintain ordering of alignment | |
# LL and UU, skip aligning for one of the characters (gap) | |
if dm[x][y] == b'UL': | |
a1.insert(0, s1[x]) | |
a2.insert(0, s2[y]) | |
x, y = x - 1, y - 1 | |
elif dm[x][y] == b'LL': | |
a1.insert(0, '_') | |
a2.insert(0, s2[y]) | |
y = y - 1 | |
elif dm[x][y] == b'UU': | |
a1.insert(0, s1[x]) | |
a2.insert(0, '_') | |
x = x - 1 | |
elif dm[x][y] == b'S': | |
a1.insert(0, s1[x]) | |
a2.insert(0, s2[x]) | |
break | |
# local alignments do not start at the beginning of sequence, pad remaining number | |
# of characters | |
if local: | |
for _ in range(x): | |
a1.insert(0, '.') | |
for _ in range(y): | |
a2.insert(0, '.') | |
# pad to align if local alignments do not start at the same position | |
if y > x: | |
for _ in range((y - x)): | |
a1.insert(0, ' ') | |
elif y < x: | |
for _ in range((x - y)): | |
a2.insert(0, ' ') | |
# if all of the found alignments are skips, there is no alignment | |
if all(map(lambda x: x == '.' or x == ' ', a1)) and all(map(lambda x: x == '.' or x == ' ', a2)): | |
print("No alignment found") | |
return | |
# print first alignment | |
print("".join(a1)) | |
# print '|'s between matches | |
for x, y in zip(a1, a2): | |
print("|", end="") if x == y and x != '.' and y != '.' else print(" ", end="") | |
print() | |
# print second alignment | |
print("".join(a2)) | |
# local alignment between DNA/protein (chose not to do in loop in fear of breaking code) | |
frame1loc = localAlign(proseq,frames[0]) | |
frame2loc = localAlign(proseq,frames[1]) | |
frame3loc = localAlign(proseq,frames[2]) | |
# global alignment between DNA/protein | |
frame1glo = globalAlign(proseq,frames[0]) | |
frame2glo = globalAlign(proseq,frames[1]) | |
frame3glo = globalAlign(proseq,frames[2]) | |
# frame 1 local | |
frame12loc = localAlign(frames[0],frames[1]) | |
frame13loc = localAlign(frames[0],frames[2]) | |
# frame 1 global | |
frame12glo = globalAlign(frames[0],frames[1]) | |
frame13glo = globalAlign(frames[0],frames[2]) | |
# frame 2 local | |
frame21loc = localAlign(frames[1],frames[0]) | |
frame23loc = localAlign(frames[1],frames[2]) | |
# frame 2 global | |
frame21glo = globalAlign(frames[1],frames[0]) | |
frame23glo = globalAlign(frames[1],frames[2]) | |
# frame 3 local | |
frame31loc = localAlign(frames[2],frames[0]) | |
frame32loc = localAlign(frames[2],frames[1]) | |
# frame 3 global | |
frame31glo = globalAlign(frames[2],frames[0]) | |
frame32glo = globalAlign(frames[2],frames[1]) | |
# print results | |
print("LOCAL ALIGNMENT BETWEEN PROTEIN/DNA SEQUENCE") | |
print() | |
print("Frame 1") | |
printAlignment(*frame1loc,proseq,frames[0],local=True) | |
print() | |
print("Frame 2") | |
printAlignment(*frame2loc,proseq,frames[1],local=True) | |
print() | |
print("Frame 3") | |
printAlignment(*frame3loc,proseq,frames[2],local=True) | |
print() | |
print("GLOBAL ALIGNMENT BETWEEN PROTEIN/DNA SEQUENCE") | |
print() | |
print("Frame 1") | |
printAlignment(*frame1glo,proseq,frames[0],local=False) | |
print() | |
print("Frame 2") | |
printAlignment(*frame2glo,proseq,frames[1],local=False) | |
print() | |
print("Frame 3") | |
printAlignment(*frame3glo,proseq,frames[2],local=False) | |
print() | |
print("LOCAL ALIGNMENT WITH READING FRAME 1") | |
print() | |
print("Frame 2") | |
printAlignment(*frame12loc,frames[0],frames[1],local=True) | |
print() | |
print("Frame 3") | |
printAlignment(*frame13loc,frames[0],frames[2],local=True) | |
print() | |
print("GLOBAL ALIGNMENT WITH READING FRAME 1") | |
print() | |
print("Frame 2") | |
printAlignment(*frame12glo,frames[0],frames[1],local=False) | |
print() | |
print("Frame 3") | |
printAlignment(*frame13glo,frames[0],frames[2],local=False) | |
print() | |
print("LOCAL ALIGNMENT WITH READING FRAME 2") | |
print() | |
print("Frame 1") | |
printAlignment(*frame21loc,frames[1],frames[0],local=True) | |
print() | |
print("Frame 3") | |
printAlignment(*frame23loc,frames[1],frames[2],local=True) | |
print() | |
print("GLOBAL ALIGNMENT WITH READING FRAME 2") | |
print() | |
print("Frame 1") | |
printAlignment(*frame21glo,frames[1],frames[0],local=False) | |
print() | |
print("Frame 3") | |
printAlignment(*frame23glo,frames[1],frames[2],local=False) | |
print() | |
print("LOCAL ALIGNMENT WITH READING FRAME 3") | |
print() | |
print("Frame 1") | |
printAlignment(*frame31loc,frames[0],frames[2],local=True) | |
print() | |
print("Frame 2") | |
printAlignment(*frame32loc,frames[2],frames[1],local=True) | |
print() | |
print("GLOBAL ALIGNMENT WITH READING FRAME 3") | |
print() | |
print("Frame 1") | |
printAlignment(*frame31glo,frames[2],frames[0],local=False) | |
print() | |
print("Frame 2") | |
printAlignment(*frame32glo,frames[2],frames[1],local=False) | |
print() |