Download as pdf or txt
Download as pdf or txt
You are on page 1of 10

Comp 6100 project Report

Submitted by:
Mohammad Bataineh
Benjamin Alikali
Nadia Bourini

1. Find the distribution of phi and psi angles for amino acids in helices and
beta sheets (something similar to Ramachandran plot).

import math
import requests
import os

class PDB:
    """Backbone phi/psi helper for one chain of a PDB entry.

    The constructor takes a 4-character PDB id immediately followed by a
    chain id (for example ``'3UTSA'``).  The entry is read from the local
    file ``<id>.txt``; when that file does not exist it is downloaded from
    RCSB first.

    Attributes:
        protein: the 4-character PDB id.
        file: name of the local copy of the entry (``<id>.txt``).
        chain: chain identifier used to filter ATOM records.

    Column positions follow the fixed-width ATOM record layout: atom name in
    columns 13-16, chain id in column 22, residue number in columns 23-26 and
    x/y/z in columns 31-38 / 39-46 / 47-54 (1-based).
    """

    # Backbone atom names needed for the phi/psi torsions.
    _BACKBONE_NAMES = ('N', 'CA', 'C')

    def __init__(self, pdb):
        """Remember id and chain, downloading the entry if it is not cached.

        Args:
            pdb: PDB id (first four characters) followed by the chain id.

        Raises:
            requests.HTTPError: the entry could not be downloaded.  Nothing
                is written to disk in that case, so a failed download is not
                mistaken for a cached entry on the next run.
        """
        self.protein = pdb[:4]
        self.file = self.protein + '.txt'
        # Everything after the id is the chain identifier.
        self.chain = pdb[4:].strip()

        # Use the local copy when there is one; otherwise fetch it once.
        if not os.path.isfile(self.file):
            url = 'https://files.rcsb.org/view/' + self.protein + '.pdb'
            response = requests.get(url.rstrip(), timeout=60)
            response.raise_for_status()
            # Write the decoded text, not the repr of the bytes object, so
            # the file holds exactly the PDB records.
            with open(self.file, 'w') as handle:
                handle.write(response.text)

    def _chain_atoms(self):
        """Yield ``(residue_number, atom_name, (x, y, z))`` per ATOM record.

        Only ATOM records of ``self.chain`` are yielded; records whose
        residue number or coordinates cannot be parsed are skipped.
        """
        with open(self.file, 'r') as handle:
            for line in handle:
                if line[:4] != 'ATOM' or line[21:22] != self.chain:
                    continue
                try:
                    # Residue number is the 4-wide field in columns 23-26.
                    resseq = int(line[22:26])
                    xyz = (float(line[30:38]),
                           float(line[38:46]),
                           float(line[46:54]))
                except ValueError:
                    continue
                yield resseq, line[12:16].strip(), xyz

    def _backbone_atoms(self):
        """Return ``{(residue_number, atom_name): (x, y, z)}`` for N/CA/C.

        The table is built on first use and reused afterwards, so computing
        every angle of a chain reads the file once instead of once per angle.
        It is rebuilt if ``self.file`` or ``self.chain`` is changed; edits to
        the file on disk after the first read are not picked up.
        """
        key = (self.file, self.chain)
        cached = getattr(self, '_backbone_cache', None)
        if cached is None or cached[0] != key:
            atoms = {}
            for resseq, name, xyz in self._chain_atoms():
                if name in self._BACKBONE_NAMES:
                    # Keep the first record when an atom is listed twice.
                    atoms.setdefault((resseq, name), xyz)
            cached = (key, atoms)
            self._backbone_cache = cached
        return cached[1]

    @staticmethod
    def _dihedral(p0, p1, p2, p3):
        """Return the signed torsion angle p0-p1-p2-p3 in degrees (4 d.p.).

        The magnitude is the angle between the normals of the planes
        (p0, p1, p2) and (p1, p2, p3); the sign is the sign of the scalar
        triple product of the three bond vectors.

        Raises:
            ZeroDivisionError: three consecutive points are collinear, so a
                plane normal has zero length.
        """
        def sub(a, b):
            return (a[0] - b[0], a[1] - b[1], a[2] - b[2])

        def dot(a, b):
            return a[0] * b[0] + a[1] * b[1] + a[2] * b[2]

        def cross(a, b):
            return (a[1] * b[2] - a[2] * b[1],
                    a[2] * b[0] - a[0] * b[2],
                    a[0] * b[1] - a[1] * b[0])

        b1, b2, b3 = sub(p1, p0), sub(p2, p1), sub(p3, p2)
        n1, n2 = cross(b1, b2), cross(b2, b3)
        cosine = dot(n1, n2) / (math.sqrt(dot(n1, n1)) * math.sqrt(dot(n2, n2)))
        # Rounding can push the cosine marginally outside [-1, 1], which
        # would make acos raise for (anti)parallel planes.
        cosine = max(-1.0, min(1.0, cosine))
        magnitude = round(math.degrees(math.acos(cosine)), 4)
        return -magnitude if dot(n1, b3) < 0 else magnitude

    def get_seq_numbers(self):
        """Return the residue numbers of the chain, in file order.

        Returns:
            list of str: each distinct residue number found in the chain's
            ATOM records, as a string, in order of first appearance.
        """
        seqNums = []
        seen = set()
        for resseq, _name, _xyz in self._chain_atoms():
            if resseq not in seen:
                seen.add(resseq)
                seqNums.append(str(resseq))
        return seqNums

    def Q1(self, aa):
        """Return the phi or psi angle of one residue, in degrees.

        Args:
            aa: ``'<residue number>.phi'`` or ``'<residue number>.psi'``.

        Returns:
            The signed angle rounded to 4 decimals.  ``None`` when a required
            backbone atom is missing (for example phi of the first residue or
            psi of the last one) or the atoms are collinear; a message is
            printed in that case.  ``None`` without a message when the angle
            type is neither ``phi`` nor ``psi``.

        Raises:
            ValueError: ``aa`` is not of the form ``number.type``.
        """
        aaNumber, angletype = aa.split('.')
        resseq = int(aaNumber)
        atoms = self._backbone_atoms()
        try:
            if angletype == 'phi':
                # phi: C(i-1) - N(i) - CA(i) - C(i)
                points = (atoms[(resseq - 1, 'C')], atoms[(resseq, 'N')],
                          atoms[(resseq, 'CA')], atoms[(resseq, 'C')])
            elif angletype == 'psi':
                # psi: N(i) - CA(i) - C(i) - N(i+1)
                points = (atoms[(resseq, 'N')], atoms[(resseq, 'CA')],
                          atoms[(resseq, 'C')], atoms[(resseq + 1, 'N')])
            else:
                return None
            return self._dihedral(*points)
        except (KeyError, ZeroDivisionError):
            print("Give amino acid or chain doesn't exist")
            return None

# Compute phi/psi for every residue of chain A of 3UTS and draw them as a
# Ramachandran-style scatter plot.
PDBf = PDB('3UTSA')

# Residue numbers of the chain; fetched once and reused below.
seqNums = PDBf.get_seq_numbers()
print(seqNums)

phis = []
psis = []

for num in seqNums:
    phiAngle = PDBf.Q1(num+".phi")
    psiAngle = PDBf.Q1(num+".psi")

    # Q1 returns None when a neighbouring backbone atom is missing (phi of
    # the first residue, psi of the last one).  A point needs both angles,
    # so such residues are left out instead of putting None into the plot.
    if phiAngle is None or psiAngle is None:
        continue

    print ("phiAngle = ", phiAngle)
    print ("psiAngle = ", psiAngle)
    phis.append(phiAngle)
    psis.append(psiAngle)

print (phis)
print (psis)

from pandas import DataFrame


import matplotlib.pyplot as plt

# One point per residue: phi on the x axis, psi on the y axis.
plt.scatter(phis, psis)
plt.show()

2. Find the distribution of phi and psi for each individual amino acid.
Highlight the distribution when the amino acid is within a helix or a sheet.

Paul did not do anything.


3. Analyze the length of helices (in terms of number of AAs). Is there any
relation between the type of a helix (alpha, 3-10,…,etc.) and its length?

This code is used to download the desired files from the PDB and keep them on our local
machine, because the database is large (1.7 GB). This code should be run only
once.

import urllib.request
import requests

import os

# List of chains to fetch: the first line is skipped and every following line
# starts with a 4-character PDB id.
# NOTE(review): the name ends in ".gz" but the file is read as plain text --
# presumably it has already been decompressed; confirm.
with open("cullpdb_pc20_res1.6_R0.25_d200326_chains3655.gz", 'r') as file:
    next(file)
    protein = file.readlines()

i=1
for line in protein:
    pdb_id = line[0:4]
    # A blank line (e.g. at the end of the list) carries no id.
    if not pdb_id.strip():
        continue
    target = pdb_id+'.pdb'
    # The same entry can be listed once per chain, and an interrupted run may
    # be restarted: only fetch what is not on disk yet.
    if not os.path.isfile(target):
        File='https://files.rcsb.org/view/'+ pdb_id.upper() + '.pdb'
        # Download under a temporary name and rename afterwards, so that a
        # partially downloaded file is never mistaken for a complete one.
        partial = target+'.part'
        urllib.request.urlretrieve(File, partial)
        os.replace(partial, target)
    print(i)
    i=i+1
This code counts the helices of each type (PDB helix class) and computes the average length per type.
# Count the helices of each PDB helix class in the listed chains and report
# the average helix length per class.

# Helix class number -> description, as printed in the report below.
HELIX_TYPES = {
    1: 'Right-handed alpha (default)',
    2: 'Right-handed omega',
    3: 'Right-handed pi',
    4: 'Right-handed gamma',
    5: 'Right-handed 3 - 10',
    6: 'Left-handed alpha',
    7: 'Left-handed omega',
    8: 'Left-handed gamma',
    9: '2 - 7 ribbon/helix',
    10: 'Polyproline',
}

# NOTE(review): the name ends in ".gz" but the file is read as plain text --
# presumably it has already been decompressed; confirm.
with open("cullpdb_pc20_res1.6_R0.25_d200326_chains3655.gz", 'r') as file:
    next(file)  # the first line of the list is skipped
    protein = file.readlines()

# Each list line starts with a 4-character PDB id followed by the chain id.
bank=[]
chain=[]
for line in protein:
    if not line.strip():
        continue
    bank.append(line[0:4]+'.pdb')
    chain.append(line[4:7].strip())

# Indexed by helix class (1..10); index 0 is unused.
count=[0]*11
total=[0]*11

for pdb_name, chain_id in zip(bank, chain):
    with open(pdb_name, 'r') as file:
        for line in file:
            if line[0:5] != "HELIX":
                continue
            # Both the first (column 20) and the last (column 32) residue of
            # the helix must belong to the requested chain.
            if line[19:20] != chain_id or line[31:32] != chain_id:
                continue
            try:
                # Helix class is the 2-wide field in columns 39-40 and the
                # length the 5-wide field in columns 72-76; reading the whole
                # field is what makes class 10 recognisable.
                helix_class = int(line[38:40])
                length = int(line[71:76])
            except ValueError:
                # Blank or truncated fields: nothing to count.
                continue
            if helix_class in HELIX_TYPES:
                count[helix_class] += 1
                total[helix_class] += length

for i in sorted(HELIX_TYPES):
    if count[i]!=0:
        Type = HELIX_TYPES[i]
        print("We found", count[i], "helix of",Type,'that has class number',i,
              '\nand the average length for this type was:', total[i]/count[i],'\n')
print('\n\nNo other helix types have been found')

Results sample

4. Find the distribution of the distance between any two consecutive Cα atoms.

import numpy as np
import pandas as pd
# Load every line of the local 3UTS entry into memory; cord() reads this list.
with open('3UTS.pdb', 'r') as file:
    pdb = file.readlines()
def cord(chain=None, lines=None):
    """Compute the distances between consecutive C-alpha atoms of one chain.

    Args:
        chain: chain identifier.  When omitted the user is prompted for it
            and the answer is upper-cased; an explicit value is used as given.
        lines: PDB file lines to scan.  When omitted the module-level ``pdb``
            list is used.

    Returns:
        A DataFrame with one ``distance`` column holding, in file order, the
        distance between each C-alpha atom and the next one.  The number of
        C-alpha atoms found and the DataFrame are also printed.
    """
    if chain is None:
        chain = input('please input your chain:').upper()
    if lines is None:
        lines = pdb

    xyz_list = []
    last_residue = None
    for line in lines:
        if line[:4] != "ATOM" or line[13:15] != "CA" or line[21:22] != chain:
            continue
        # Residue number plus insertion code (columns 23-27).  A second CA
        # for the same residue is an alternate location, not the next
        # residue, and would add a meaningless near-zero distance.
        residue = line[22:27]
        if residue == last_residue:
            continue
        try:
            # Coordinates are fixed-width fields (columns 31-38, 39-46 and
            # 47-54).  Splitting on whitespace fails when neighbouring
            # fields touch, and starting after column 32 cuts off the sign
            # or leading digit of wide x values.
            xyz = [float(line[30:38]), float(line[38:46]), float(line[46:54])]
        except ValueError:
            continue
        last_residue = residue
        xyz_list.append(xyz)
    print(len(xyz_list))

    # Rows are points; differences of neighbouring rows are the CA-CA vectors.
    coords = np.asarray(xyz_list, dtype=float).reshape(-1, 3)
    final_dist = np.linalg.norm(np.diff(coords, axis=0), axis=1)
    df = pd.DataFrame(final_dist, columns=['distance'])
    print(df)
    return df
# Run the analysis: prompts for a chain id, then prints the C-alpha distances.
cord()

You might also like