Source code for frustratometer.classes.AWSEM

import numpy as np
from ..utils import _path
from .. import frustration
from .Frustratometer import Frustratometer
from .Gamma import Gamma
from pydantic import BaseModel, Field, ConfigDict
from pydantic.types import Path
from typing import List,Optional,Union

__all__ = ['AWSEM']

class AWSEMParameters(BaseModel):
    model_config = ConfigDict(extra='ignore', arbitrary_types_allowed=True)
    """Default parameters for AWSEM energy calculations."""
    k_contact: float = Field(4.184, description="Coefficient for contact potential. (kJ/mol)")
    
    #Density
    eta: float = Field(5.0, description="Sharpness of the distance-based switching function (Angstrom^-1).")
    rho_0: float = Field(2.6, description="Density cutoff defining buried residues.")
    min_sequence_separation_rho: Optional[int] = Field(2, description="Minimum sequence separation for density calculation.")

    #Burial potential
    burial_in_context: Optional[bool] = Field(True, description="For substructure objects, this may include interactions from remainder of protein in burial term.")
    burial_kappa: float = Field(4.0, description="Sharpness of the density-based switching function for the burial potential wells")
    burial_ro_min: List[float] = Field([0.0, 3.0, 6.0], description="Minimum radii for burial potential wells. (Angstrom)")
    burial_ro_max: List[float] = Field([3.0, 6.0, 9.0], description="Maximum radii for burial potential wells. (Angstrom)")
    
    #Direct contacts
    min_sequence_separation_contact: Optional[int] = Field(0, description="Minimum sequence separation for contact calculation.")
    distance_cutoff_contact: Optional[float] = Field(9.5, description="Distance cutoff for contact calculation. (Angstrom)")
    r_min: float = Field(4.5, description="Minimum distance for direct contact potential. (Angstrom)")
    r_max: float = Field(6.5, description="Maximum distance for direct contact potential. (Angstrom)")
    
    #Mediated contacts
    gamma: Union[Path,Gamma] = Field(_path/'data'/'AWSEM_2015.json', description="File or Gamma object containing the Gamma values")
    r_minII: float = Field(6.5, description="Minimum distance for mediated contact potential. (Angstrom)")
    r_maxII: float = Field(9.5, description="Maximum distance for mediated contact potential. (Angstrom)")
    eta_sigma: float = Field(7.0, description="Sharpness of the density-based switching function between protein-mediated and water-mediated contacts.")
    

    #Membrane
    membrane_gamma: Union[Path,Gamma] = Field(_path/'data'/'AWSEM_membrane_2015.json', description="File or Gamma object containing the membrane Gamma values (for membrane proteins)")
    eta_switching: int = Field(10, description="Switching distance for the membrane switching function")

    #Electrostatics
    min_sequence_separation_electrostatics: Optional[int] = Field(1, description="Minimum sequence separation for electrostatics calculation.")
    k_electrostatics: float = Field(17.3636, description="Coefficient for electrostatic interactions. (kJ/mol)")
    electrostatics_screening_length: float = Field(10, description="Screening length for electrostatic interactions. (Angstrom)")

[docs] class AWSEM(Frustratometer): #Mapping to DCA q = 20 aa_map_awsem_list = [0, 0, 4, 3, 6, 13, 7, 8, 9, 11, 10, 12, 2, 14, 5, 1, 15, 16, 19, 17, 18] #A gap has no energy aa_map_awsem_x, aa_map_awsem_y = np.meshgrid(aa_map_awsem_list, aa_map_awsem_list, indexing='ij') def __init__(self, pdb_structure: object, sequence: str =None, expose_indicator_functions: bool=False, **parameters)->object: """ Generate AWSEM object Parameters ---------- pdb_structure : object Structure object generated by Structure class sequence : str The amino acid sequence of the protein. The sequence is assumed to be in one-letter code. expose_indicator_functions: bool If set to True, indicator functions of the contact and burial energy terms can be accessed by user. Returns ------- AWSEM object """ #Set attributes p = AWSEMParameters(**parameters) if p.min_sequence_separation_contact is None: p.min_sequence_separation_contact = 1 if p.min_sequence_separation_rho is None: p.min_sequence_separation_rho = 1 if p.min_sequence_separation_electrostatics is None: p.min_sequence_separation_electrostatics = 1 for field, value in p: setattr(self, field, value) #Gamma parameters if isinstance(p.gamma, Gamma): gamma = p.gamma elif isinstance(p.gamma, Path): gamma = Gamma(p.gamma) else: raise ValueError("Gamma parameter must be a path or a Gamma object.") self.gamma=gamma self.burial_gamma = gamma['Burial'].T self.direct_gamma = gamma['Direct'][0] self.protein_gamma = gamma['Protein'][0] self.water_gamma = gamma['Water'][0] self.burial_in_context=p.burial_in_context #Structure details self.full_to_aligned_index_dict=pdb_structure.full_to_aligned_index_dict if sequence is None: self.sequence=pdb_structure.sequence else: self.sequence=sequence self.structure=pdb_structure.structure self.chain=pdb_structure.chain self.pdb_file=pdb_structure.pdb_file self.init_index_shift=pdb_structure.init_index_shift self.distance_matrix=pdb_structure.distance_matrix self.full_pdb_distance_matrix=pdb_structure.full_pdb_distance_matrix selection_CB = self.structure.select('name CB or (resname GLY IGL and name CA)') resid = selection_CB.getResindices() self.resid=resid self.N=len(self.resid) assert self.N == len(self.sequence), "The pdb is incomplete. Try setting 'repair_pdb=True' when constructing the Structure object." if self.burial_in_context==True: selected_matrix=self.full_pdb_distance_matrix else: selected_matrix=self.distance_matrix sequence_mask_rho = frustration.compute_mask(selected_matrix, maximum_contact_distance=None, minimum_sequence_separation = p.min_sequence_separation_rho) sequence_mask_contact = frustration.compute_mask(self.distance_matrix, maximum_contact_distance=p.distance_cutoff_contact, minimum_sequence_separation = p.min_sequence_separation_contact) self._decoy_fluctuation = {} self.minimally_frustrated_threshold=.78 # Calculate rho rho = 0.25 rho *= (1 + np.tanh(p.eta * (selected_matrix- p.r_min))) rho *= (1 + np.tanh(p.eta * (p.r_max - selected_matrix))) rho *= sequence_mask_rho self.rho=rho #Calculate sigma water rho_r = (rho).sum(axis=1) if self.full_pdb_distance_matrix.shape!=self.distance_matrix.shape: if self.burial_in_context==True: self.init_index_shift=pdb_structure.init_index_shift self.fin_index_shift=pdb_structure.fin_index_shift rho_r=rho_r[self.init_index_shift:self.fin_index_shift] self.rho_r=rho_r rho_b = np.expand_dims(rho_r, 1) rho1 = np.expand_dims(rho_r, 0) rho2 = np.expand_dims(rho_r, 1) sigma_water = 0.25 * (1 - np.tanh(p.eta_sigma * (rho1 - p.rho_0))) * (1 - np.tanh(p.eta_sigma * (rho2 - p.rho_0))) sigma_protein = 1 - sigma_water #Calculate theta and indicators theta = 0.25 * (1 + np.tanh(p.eta * (self.distance_matrix - p.r_min))) * (1 + np.tanh(p.eta * (p.r_max - self.distance_matrix))) thetaII = 0.25 * (1 + np.tanh(p.eta * (self.distance_matrix - p.r_minII))) * (1 + np.tanh(p.eta * (p.r_maxII - self.distance_matrix))) burial_indicator = np.tanh(p.burial_kappa * (rho_b - p.burial_ro_min)) + np.tanh(p.burial_kappa * (p.burial_ro_max - rho_b)) direct_indicator = theta[:, :, np.newaxis, np.newaxis] water_indicator = thetaII[:, :, np.newaxis, np.newaxis] * sigma_water[:, :, np.newaxis, np.newaxis] protein_indicator = thetaII[:, :, np.newaxis, np.newaxis] * sigma_protein[:, :, np.newaxis, np.newaxis] if expose_indicator_functions: self.indicators=[] self.indicators.append(burial_indicator[:,0]) self.indicators.append(burial_indicator[:,1]) self.indicators.append(burial_indicator[:,2]) self.indicators.append(direct_indicator[:,:,0,0]*sequence_mask_contact) self.indicators.append(protein_indicator[:,:,0,0]*sequence_mask_contact) self.indicators.append(water_indicator[:,:,0,0]*sequence_mask_contact) self.gamma_array=[] temp_burial_gamma=self.burial_gamma[self.aa_map_awsem_list] temp_burial_gamma[0]=0 temp_burial_gamma *= -0.5 * p.k_contact self.gamma_array.append(temp_burial_gamma[:,0]) self.gamma_array.append(temp_burial_gamma[:,1]) self.gamma_array.append(temp_burial_gamma[:,2]) for contact_gamma in [self.direct_gamma, self.protein_gamma, self.water_gamma]: temp_gamma = contact_gamma[self.aa_map_awsem_x, self.aa_map_awsem_y].copy() temp_gamma[0, :] = 0 temp_gamma[:, 0] = 0 temp_gamma *= -0.5 * self.k_contact self.gamma_array.append(temp_gamma) self.burial_indicator = burial_indicator self.direct_indicator = direct_indicator self.water_indicator = water_indicator self.protein_indicator = protein_indicator J_index = np.meshgrid(range(self.N), range(self.N), range(self.q), range(self.q), indexing='ij', sparse=False) h_index = np.meshgrid(range(self.N), range(self.q), indexing='ij', sparse=False) #Burial energy burial_energy = 0.5 * p.k_contact * self.burial_gamma[h_index[1]] * burial_indicator[:, np.newaxis, :] self.burial_energy = burial_energy #Contact energy direct = direct_indicator * self.direct_gamma[J_index[2], J_index[3]] water_mediated = water_indicator * self.water_gamma[J_index[2], J_index[3]] protein_mediated = protein_indicator * self.protein_gamma[J_index[2], J_index[3]] contact_energy = p.k_contact * np.array([direct, water_mediated, protein_mediated]) * sequence_mask_contact[np.newaxis, :, :, np.newaxis, np.newaxis] # Compute electrostatics if p.k_electrostatics!=0: self.sequence_cutoff=min(p.min_sequence_separation_electrostatics, p.min_sequence_separation_contact) self.distance_cutoff=None electrostatics_mask = frustration.compute_mask(self.distance_matrix, maximum_contact_distance=None, minimum_sequence_separation=p.min_sequence_separation_electrostatics) # ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] charges = np.array([0, 1, 0, -1, 0, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]) charges2 = charges[:,np.newaxis]*charges[np.newaxis,:] electrostatics_indicator = 1 / (self.distance_matrix + 1E-6) * np.exp(-self.distance_matrix / p.electrostatics_screening_length) * electrostatics_mask electrostatics_energy = -p.k_electrostatics * (charges2[np.newaxis,np.newaxis,:,:]*electrostatics_indicator[:,:,np.newaxis,np.newaxis]) contact_energy = np.append(contact_energy, electrostatics_energy[np.newaxis,:,:,:,:], axis=0) if expose_indicator_functions: self.indicators.append(electrostatics_indicator) temp_gamma=0.5 * p.k_electrostatics * charges2[self.aa_map_awsem_x, self.aa_map_awsem_y] temp_gamma[0,:]=0 temp_gamma[:,0]=0 self.gamma_array.append(temp_gamma) else: self.sequence_cutoff=p.min_sequence_separation_contact self.distance_cutoff=p.distance_cutoff_contact self.mask = frustration.compute_mask(self.distance_matrix, maximum_contact_distance=self.distance_cutoff, minimum_sequence_separation = self.sequence_cutoff) self.contact_energy = contact_energy # Compute fast properties self.aa_freq = frustration.compute_aa_freq(self.sequence) self.contact_freq = frustration.compute_contact_freq(self.sequence) self.potts_model = {} self.potts_model['h'] = burial_energy.sum(axis=-1)[:, self.aa_map_awsem_list] self.potts_model['J'] = contact_energy.sum(axis=0)[:, :, self.aa_map_awsem_x, self.aa_map_awsem_y] # Set the gap energy to zero self.potts_model['h'][:, 0] = 0 self.potts_model['J'][:, :, 0, :] = 0 self.potts_model['J'][:, :, :, 0] = 0 self._native_energy=None
[docs] def compute_configurational_decoy_statistics(self, n_decoys=4000,aa_freq=None): # ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] _AA='ARNDCQEGHILKMFPSTWYV' if aa_freq is None: seq_index = np.array([_AA.find(aa) for aa in self.sequence]) N=self.N else: N=self.N*10 total = sum(aa_freq) probabilities = [freq / total for freq in aa_freq.ravel()] seq_index = np.random.choice(a=len(aa_freq), size=N, p=probabilities) distances = np.triu(self.distance_matrix) distances = distances[(distances<self.distance_cutoff_contact) & (distances>0)] rho_b = np.expand_dims(self.rho_r, 1) #(n,1) rho1 = np.expand_dims(self.rho_r, 0) #(1,n) rho2 = np.expand_dims(self.rho_r, 1) #(n,1) sigma_water = 0.25 * (1 - np.tanh(self.eta_sigma * (rho1 - self.rho_0))) * (1 - np.tanh(self.eta_sigma * (rho2 - self.rho_0))) #(n,n) sigma_protein = 1 - sigma_water #(n,n) #Calculate theta and indicators theta = 0.25 * (1 + np.tanh(self.eta * (distances - self.r_min))) * (1 + np.tanh(self.eta * (self.r_max - distances))) # (c,) thetaII = 0.25 * (1 + np.tanh(self.eta * (distances - self.r_minII))) * (1 + np.tanh(self.eta * (self.r_maxII - distances))) #(c,) burial_indicator = np.tanh(self.burial_kappa * (rho_b - self.burial_ro_min)) + np.tanh(self.burial_kappa * (self.burial_ro_max - rho_b)) #(n,3) charges = np.array([0, 1, 0, -1, 0, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]) electrostatics_indicator = np.exp(-distances / self.electrostatics_screening_length) / distances decoy_energies=np.zeros(n_decoys) #decoy_data=[None]*n_decoys #decoy_data_columns=['decoy_i','rand_i_resno','rand_j_resno','ires_type','jres_type','i_resno','j_resno','rij','rho_i','rho_j','water_energy','burial_energy_i','burial_energy_j','electrostatic_energy','tert_frust_decoy_energies'] for i in range(n_decoys): c=np.random.randint(0,len(distances)) n1=np.random.randint(0,self.N) n2=np.random.randint(0,self.N) qi1=np.random.randint(0,N) qi2=np.random.randint(0,N) q1=seq_index[qi1] q2=seq_index[qi2] burial_energy1 = (-0.5 * self.k_contact * self.burial_gamma[q1] * burial_indicator[n1]).sum(axis=0) burial_energy2 = (-0.5 * self.k_contact * self.burial_gamma[q2] * burial_indicator[n2]).sum(axis=0) direct = theta[c] * self.direct_gamma[q1, q2] water_mediated = sigma_water[n1,n2] * thetaII[c] * self.water_gamma[q1,q2] protein_mediated = sigma_protein[n1,n2] * thetaII[c] * self.protein_gamma[q1,q2] contact_energy = -self.k_contact * (direct+water_mediated+protein_mediated) electrostatics_energy = self.k_electrostatics * electrostatics_indicator[c]*charges[q1]*charges[q2] decoy_energies[i]=(burial_energy1+burial_energy2+contact_energy+electrostatics_energy) #decoy_data[i]=[i, qi1, qi2, q1, q2, n1, n2, distances[c], self.rho_r[n1], self.rho_r[n2], contact_energy/4.184, burial_energy1/4.184, burial_energy2/4.184, electrostatics_energy/4.184, decoy_energies[i]] mean_decoy_energy = np.mean(decoy_energies) std_decoy_energy = np.std(decoy_energies) return mean_decoy_energy, std_decoy_energy
[docs] def compute_configurational_energies(self): _AA='ARNDCQEGHILKMFPSTWYV' seq_index = np.array([_AA.find(aa) for aa in self.sequence]) distances = np.triu(self.distance_matrix) distances = distances[(distances<self.distance_cutoff_contact) & (distances>0)] n_contacts=len(distances) n = self.distance_matrix.shape[0] # Assuming self.distance_matrix is defined and square tri_upper_indices = np.triu_indices(n, k=1) # k=1 excludes the diagonal valid_pairs = (self.distance_matrix[tri_upper_indices] < self.distance_cutoff_contact) & \ (self.distance_matrix[tri_upper_indices] > 0) indices1,indices2 = (tri_upper_indices[0][valid_pairs], tri_upper_indices[1][valid_pairs]) # for n1,n2,c in zip(indices1,indices2,range(n_contacts)): # assert self.distance_matrix[n1,n2] == distances[c] rho_b = np.expand_dims(self.rho_r, 1) #(n,1) rho1 = np.expand_dims(self.rho_r, 0) #(1,n) rho2 = np.expand_dims(self.rho_r, 1) #(n,1) sigma_water = 0.25 * (1 - np.tanh(self.eta_sigma * (rho1 - self.rho_0))) * (1 - np.tanh(self.eta_sigma * (rho2 - self.rho_0))) #(n,n) sigma_protein = 1 - sigma_water #(n,n) #Calculate theta and indicators theta = 0.25 * (1 + np.tanh(self.eta * (distances - self.r_min))) * (1 + np.tanh(self.eta * (self.r_max - distances))) # (c,) thetaII = 0.25 * (1 + np.tanh(self.eta * (distances - self.r_minII))) * (1 + np.tanh(self.eta * (self.r_maxII - distances))) #(c,) burial_indicator = np.tanh(self.burial_kappa * (rho_b - self.burial_ro_min)) + np.tanh(self.burial_kappa * (self.burial_ro_max - rho_b)) #(n,3) charges = np.array([0, 1, 0, -1, 0, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]) electrostatics_indicator = np.exp(-distances / self.electrostatics_screening_length) / distances # decoy_data_columns=['decoy_i','i_resno','j_resno','ires_type','jres_type','aa1','aa2','rij','rho_i','rho_j','water_energy','burial_energy_i','burial_energy_j','electrostatic_energy','total_energies'] # decoy_data=[] configurational_energies=np.zeros((n,n)) for c in range(n_contacts): n1=indices1[c] n2=indices2[c] q1=seq_index[n1] q2=seq_index[n2] burial_energy1 = (-0.5 * self.k_contact * self.burial_gamma[q1] * burial_indicator[n1]).sum(axis=0) burial_energy2 = (-0.5 * self.k_contact * self.burial_gamma[q2] * burial_indicator[n2]).sum(axis=0) direct = theta[c] * self.direct_gamma[q1, q2] water_mediated = sigma_water[n1,n2] * thetaII[c] * self.water_gamma[q1,q2] protein_mediated = sigma_protein[n1,n2] * thetaII[c] * self.protein_gamma[q1,q2] contact_energy = -self.k_contact * (direct+water_mediated+protein_mediated) electrostatics_energy = self.k_electrostatics * electrostatics_indicator[c]*charges[q1]*charges[q2] energy=(burial_energy1+burial_energy2+contact_energy+electrostatics_energy) configurational_energies[n1,n2]=energy configurational_energies[n2,n1]=energy # decoy_data+=[[c, n1, n2, q1, q2, _AA[q1],_AA[q2], distances[c], self.rho_r[n1], self.rho_r[n2], contact_energy/4.184, burial_energy1/4.184, burial_energy2/4.184, electrostatics_energy/4.184, energy/4.184]] # import pandas as pd return configurational_energies #, pd.DataFrame(decoy_data, columns=decoy_data_columns)
[docs] def configurational_frustration(self,aa_freq=None, correction=0, n_decoys=4000): mean_decoy_energy, std_decoy_energy = self.compute_configurational_decoy_statistics(n_decoys=n_decoys,aa_freq=aa_freq) return -(self.compute_configurational_energies()-mean_decoy_energy)/(std_decoy_energy+correction)