PP2_HaileySzadowski.py

# PP2

from Bio import SeqIO
import numpy as np

# read in protein sequences
# protein sequences must be pro1.fasta and pro2.fasta

recordpro1 = SeqIO.read("pro1.fasta","fasta")  # convert to string format
pro1seq = str(recordpro1.seq)

recordpro2 = SeqIO.read("pro2.fasta","fasta")
pro2seq = str(recordpro2.seq)


# define number of desired alignments
# if no more alignments present, print no alignments found

numseq = 4

# read in scoring table, must be BLOSUM62

scoring = np.loadtxt("BLOSUM62",skiprows=1,usecols=range(1,25))

# list of AA in order of table
aa = ['A','R','N','D','C','Q','E','G','H','I','L','K','M','F','P','S','T','W','Y','V','B','Z','X','*']

# define score lookup function
#a1 = sequence 1, number of rows
#a2 = sequence 2, number of columns
def scoreLookup(a1,a2):
    for i in range(24):
        if a1 == aa[i]:
            i1 = i
        if a2 == aa[i]:
            i2 = i
    score = scoring[i1,i2]
    return score

# perform local alignment (Smith Waterman)

def localAlign(s1,s2):
    # define dimensions
    xdim = len(s1)
    ydim = len(s2)
    # create scoring matrix
    alignmatrix = np.zeros((xdim,ydim))

    dirmatrix = np.chararray((xdim, ydim), itemsize=2) # matrix used to determine direction of optimal alignment

    # set up outermost values

    # row 1
    for i in range(ydim):
        if scoreLookup(s1[0],s2[i]) >= 0:
            alignmatrix[0,i] = scoreLookup(s1[0],s2[i])
        else:
            alignmatrix[0,i] = 0
        dirmatrix[0,i] = 'LL' # define alignment direction

    # column 1
    for i in range(xdim):
        if scoreLookup(s1[i],s2[0]) >= 0:
            alignmatrix[i,0] = scoreLookup(s1[i],s2[0])
        else:
            alignmatrix[i,0] = 0
        dirmatrix[i,0] = 'UU' # define alignment direction
    dirmatrix[0,0] = 'S'

    go = -10 # gap opening penalty
    ge = -1 # gap extension penalty

    # default values
    gap = False
    gaplength = 0

    # iterate through scoring matrix
    for i in range(1,ydim):
        for n in range(1,xdim):

            # list to store all possible scores
            possiblescores = [0,0,0,0]

            # match/mismatch
            possiblescores[0] = alignmatrix[n-1,i-1] + scoreLookup(s1[n],s2[i])

            # gap extension
            if gap == True:
                possiblescores[1] = alignmatrix[n-1,i] + ge*gaplength
                possiblescores[2] = alignmatrix[n,i-1] + ge*gaplength

            # gap opening
            if gap == False:
                possiblescores[1] = alignmatrix[n-1,i] + go
                possiblescores[2] = alignmatrix[n,i-1] + go

            # update gap length
            if possiblescores[1] == max(possiblescores) or possiblescores[2] == max(possiblescores):
                gaplength = gaplength + 1
                if possiblescores[1] == max(possiblescores):
                    dirmatrix[n, i] = 'UU' # define alignment direction
                if possiblescores[2] == max(possiblescores):
                    dirmatrix[n, i] = 'LL'
            else:
                gaplength = 0
                if possiblescores[0] == max(possiblescores):
                    dirmatrix[n, i] = 'UL' # define alignment direction
                if possiblescores[3] == max(possiblescores):
                    dirmatrix[n, i] = 'S'
            alignmatrix[n,i] = max(possiblescores) # set position equal to max value

    return alignmatrix, dirmatrix

# print found alignments

def printAlignment(sm, dm, s1, s2, local=True):
    a1, a2 = [], [] # a1 and a2 store the alignments for s1 and s2 respectively
    xdim, ydim = sm.shape # find shape of the alignment table

    if local: # the starting point for backtracking for local alignment is the largest score
        _, x, y = max((v, x, y) for x, row in enumerate(sm) for y, v in enumerate(row)) # tiebreak by taking larger x values then larger y values
        for _ in range(xdim - x - 1): # if we start at position x, we have xdim - x + 1 number of characters at the end
            a1.append('.')
        for _ in range(ydim - y - 1):
            a2.append('.')
    else: # global alignment, start at the end
        x, y = xdim - 1, ydim - 1

    while x >= 0 and y >= 0: # repeatedly backtrack until at start
        if local: # for a local alignment, once the score reaches 0, the alignment ends
            if sm[x][y] == 0:
                break

        # based on the direction given in the direction matrix, backtrack a;lignment
        # since starting from the end, prepend by inserting at 0 to maintain ordering of alignment

        # LL and UU, skip aligning for one of the characters (gap)
        if dm[x][y] == b'UL':
            a1.insert(0, s1[x])
            a2.insert(0, s2[y])
            x, y = x - 1, y - 1

        elif dm[x][y] == b'LL':
            a1.insert(0, '_')
            a2.insert(0, s2[y])
            y = y - 1

        elif dm[x][y] == b'UU':
            a1.insert(0, s1[x])
            a2.insert(0, '_')
            x = x - 1

        elif dm[x][y] == b'S':
            a1.insert(0, s1[x])
            a2.insert(0, s2[x])
            break

    # local alignments do not start at the beginning of sequence, pad remaining number
    # of characters
    if local:
        for _ in range(x+1):
            a1.insert(0, '.')
        for _ in range(y+1):
            a2.insert(0, '.')

        # pad to align if local alignments do not start at the same position
        if y > x:
            for _ in range((y - x)):
                a1.insert(0, ' ')
        elif y < x:
            for _ in range((x - y)):
                a2.insert(0, ' ')

    # if all of the found alignments are skips, there is no alignment
    if all(map(lambda x: x == '.' or x == ' ', a1)) and all(map(lambda x: x == '.' or x == ' ', a2)):
        print("No alignment found")
        return "", ""

    # print first alignment
    print("".join(a1))
    # print '|'s between matches
    for x, y in zip(a1, a2):
        print("|", end="") if x == y and x != '.' and y != '.' else print(" ", end="")
    print()
    # print second alignment
    print("".join(a2))

    return a1, a2

# find non-overlapping non-optimial alignments

def nsubopts(n, s1, s2):
    for _ in range(n+1): # n times! :)
        if _ == 0:
            print("Optimal Alignment:\n")

        elif _ == 1:
            print()
            print("Suboptimal Alignments:\n")

        sm, dm = localAlign(s1, s2) # perform alignment
        a1, a2 = printAlignment(sm, dm, s1, s2, local=True) # print alignment, store alignment

        print()

        if not a1 or not a2: # if no alignment found, end iteration
            break

        for x, y in zip(a1, a2): # iterate over the alignment pairs
            if x == "_" or y == "_" or x == "." or y == "." or x == " " or y == " ":
                continue

            for i in range(24): # identify position in scoring array
                if x == aa[i]:
                    i1 = i
                if y == aa[i]:
                    i2 = i

            # score is now 0, as per hint
            scoring[i1, i2] = 0
            scoring[i2, i1] = 0


# find and print alignments
nsubopts(numseq, pro1seq, pro2seq)
	# PP2

	from Bio import SeqIO
	import numpy as np

	# read in protein sequences
	# protein sequences must be pro1.fasta and pro2.fasta

	recordpro1 = SeqIO.read("pro1.fasta","fasta") # convert to string format
	pro1seq = str(recordpro1.seq)

	recordpro2 = SeqIO.read("pro2.fasta","fasta")
	pro2seq = str(recordpro2.seq)


	# define number of desired alignments
	# if no more alignments present, print no alignments found

	numseq = 4

	# read in scoring table, must be BLOSUM62

	scoring = np.loadtxt("BLOSUM62",skiprows=1,usecols=range(1,25))

	# list of AA in order of table
	aa = ['A','R','N','D','C','Q','E','G','H','I','L','K','M','F','P','S','T','W','Y','V','B','Z','X','*']

	# define score lookup function
	#a1 = sequence 1, number of rows
	#a2 = sequence 2, number of columns
	def scoreLookup(a1,a2):
	for i in range(24):
	if a1 == aa[i]:
	i1 = i
	if a2 == aa[i]:
	i2 = i
	score = scoring[i1,i2]
	return score

	# perform local alignment (Smith Waterman)

	def localAlign(s1,s2):
	# define dimensions
	xdim = len(s1)
	ydim = len(s2)
	# create scoring matrix
	alignmatrix = np.zeros((xdim,ydim))

	dirmatrix = np.chararray((xdim, ydim), itemsize=2) # matrix used to determine direction of optimal alignment

	# set up outermost values

	# row 1
	for i in range(ydim):
	if scoreLookup(s1[0],s2[i]) >= 0:
	alignmatrix[0,i] = scoreLookup(s1[0],s2[i])
	else:
	alignmatrix[0,i] = 0
	dirmatrix[0,i] = 'LL' # define alignment direction

	# column 1
	for i in range(xdim):
	if scoreLookup(s1[i],s2[0]) >= 0:
	alignmatrix[i,0] = scoreLookup(s1[i],s2[0])
	else:
	alignmatrix[i,0] = 0
	dirmatrix[i,0] = 'UU' # define alignment direction
	dirmatrix[0,0] = 'S'

	go = -10 # gap opening penalty
	ge = -1 # gap extension penalty

	# default values
	gap = False
	gaplength = 0

	# iterate through scoring matrix
	for i in range(1,ydim):
	for n in range(1,xdim):

	# list to store all possible scores
	possiblescores = [0,0,0,0]

	# match/mismatch
	possiblescores[0] = alignmatrix[n-1,i-1] + scoreLookup(s1[n],s2[i])

	# gap extension
	if gap == True:
	possiblescores[1] = alignmatrix[n-1,i] + ge*gaplength
	possiblescores[2] = alignmatrix[n,i-1] + ge*gaplength

	# gap opening
	if gap == False:
	possiblescores[1] = alignmatrix[n-1,i] + go
	possiblescores[2] = alignmatrix[n,i-1] + go

	# update gap length
	if possiblescores[1] == max(possiblescores) or possiblescores[2] == max(possiblescores):
	gaplength = gaplength + 1
	if possiblescores[1] == max(possiblescores):
	dirmatrix[n, i] = 'UU' # define alignment direction
	if possiblescores[2] == max(possiblescores):
	dirmatrix[n, i] = 'LL'
	else:
	gaplength = 0
	if possiblescores[0] == max(possiblescores):
	dirmatrix[n, i] = 'UL' # define alignment direction
	if possiblescores[3] == max(possiblescores):
	dirmatrix[n, i] = 'S'
	alignmatrix[n,i] = max(possiblescores) # set position equal to max value

	return alignmatrix, dirmatrix

	# print found alignments

	def printAlignment(sm, dm, s1, s2, local=True):
	a1, a2 = [], [] # a1 and a2 store the alignments for s1 and s2 respectively
	xdim, ydim = sm.shape # find shape of the alignment table

	if local: # the starting point for backtracking for local alignment is the largest score
	_, x, y = max((v, x, y) for x, row in enumerate(sm) for y, v in enumerate(row)) # tiebreak by taking larger x values then larger y values
	for _ in range(xdim - x - 1): # if we start at position x, we have xdim - x + 1 number of characters at the end
	a1.append('.')
	for _ in range(ydim - y - 1):
	a2.append('.')
	else: # global alignment, start at the end
	x, y = xdim - 1, ydim - 1

	while x >= 0 and y >= 0: # repeatedly backtrack until at start
	if local: # for a local alignment, once the score reaches 0, the alignment ends
	if sm[x][y] == 0:
	break

	# based on the direction given in the direction matrix, backtrack a;lignment
	# since starting from the end, prepend by inserting at 0 to maintain ordering of alignment

	# LL and UU, skip aligning for one of the characters (gap)
	if dm[x][y] == b'UL':
	a1.insert(0, s1[x])
	a2.insert(0, s2[y])
	x, y = x - 1, y - 1

	elif dm[x][y] == b'LL':
	a1.insert(0, '_')
	a2.insert(0, s2[y])
	y = y - 1

	elif dm[x][y] == b'UU':
	a1.insert(0, s1[x])
	a2.insert(0, '_')
	x = x - 1

	elif dm[x][y] == b'S':
	a1.insert(0, s1[x])
	a2.insert(0, s2[x])
	break

	# local alignments do not start at the beginning of sequence, pad remaining number
	# of characters
	if local:
	for _ in range(x+1):
	a1.insert(0, '.')
	for _ in range(y+1):
	a2.insert(0, '.')

	# pad to align if local alignments do not start at the same position
	if y > x:
	for _ in range((y - x)):
	a1.insert(0, ' ')
	elif y < x:
	for _ in range((x - y)):
	a2.insert(0, ' ')

	# if all of the found alignments are skips, there is no alignment
	if all(map(lambda x: x == '.' or x == ' ', a1)) and all(map(lambda x: x == '.' or x == ' ', a2)):
	print("No alignment found")
	return "", ""

	# print first alignment
	print("".join(a1))
	# print '\|'s between matches
	for x, y in zip(a1, a2):
	print("\|", end="") if x == y and x != '.' and y != '.' else print(" ", end="")
	print()
	# print second alignment
	print("".join(a2))

	return a1, a2

	# find non-overlapping non-optimial alignments

	def nsubopts(n, s1, s2):
	for _ in range(n+1): # n times! :)
	if _ == 0:
	print("Optimal Alignment:\n")

	elif _ == 1:
	print()
	print("Suboptimal Alignments:\n")

	sm, dm = localAlign(s1, s2) # perform alignment
	a1, a2 = printAlignment(sm, dm, s1, s2, local=True) # print alignment, store alignment

	print()

	if not a1 or not a2: # if no alignment found, end iteration
	break

	for x, y in zip(a1, a2): # iterate over the alignment pairs
	if x == "_" or y == "_" or x == "." or y == "." or x == " " or y == " ":
	continue

	for i in range(24): # identify position in scoring array
	if x == aa[i]:
	i1 = i
	if y == aa[i]:
	i2 = i

	# score is now 0, as per hint
	scoring[i1, i2] = 0
	scoring[i2, i1] = 0


	# find and print alignments
	nsubopts(numseq, pro1seq, pro2seq)