import prody
import scipy.spatial.distance as sdist
import numpy as np
from typing import Union
from pathlib import Path
_AA = '-ACDEFGHIKLMNPQRSTVWY'
[docs]
def compute_mask(distance_matrix: np.array,
maximum_contact_distance: Union[float, None] = None,
minimum_sequence_separation: Union[int, None] = None) -> np.array:
"""
Computes a 2D Boolean mask from a given distance matrix based on a distance cutoff and a sequence separation cutoff.
Parameters
----------
distance_matrix : np.array
A 2D array where the element at index [i, j] represents the spatial distance
between residues i and j. This matrix is assumed to be symmetric.
maximum_contact_distance : float, optional
The maximum distance of a contact. Pairs of residues with distances less than this
threshold are marked as True in the mask. If None, the spatial distance criterion
is ignored and all distances are included. Default is None.
minimum_sequence_separation : int, optional
A minimum sequence distance threshold. Pairs of residues with sequence indices
differing by at least this value are marked as True in the mask. If None,
the sequence distance criterion is ignored. Default is None.
Returns
-------
mask : np.array
A 2D Boolean array of the same dimensions as `distance_matrix`. Elements of the mask
are True where the residue pairs meet the specified `distance_cutoff` and
`sequence_distance_cutoff` criteria.
Examples
--------
>>> import numpy as np
>>> dm = np.array([[0, 1, 2], [1, 0, 1], [2, 1, 0]])
>>> print(compute_mask(dm, distance_cutoff=1.5, sequence_distance_cutoff=1))
[[False True False]
[ True False True]
[False True False]]
.. todo:: Add chain information for sequence separation
"""
seq_len = len(distance_matrix)
mask = np.ones([seq_len, seq_len])
if minimum_sequence_separation is not None:
sequence_distance = sdist.squareform(sdist.pdist(np.arange(seq_len)[:, np.newaxis]))
mask *= sequence_distance >= minimum_sequence_separation
if maximum_contact_distance is not None:
mask *= distance_matrix <= maximum_contact_distance
return mask.astype(np.bool_)
[docs]
def compute_native_energy(seq: str,
potts_model: dict,
mask: np.array,
ignore_gap_couplings: bool = False,
ignore_gap_fields: bool = False) -> float:
"""
Computes the native energy of a protein sequence based on a given Potts model and an interaction mask.
.. math::
E = \\sum_i h_i + \\frac{1}{2} \\sum_{i,j} J_{ij} \\Theta_{ij}
Parameters
----------
seq : str
The amino acid sequence of the protein. The sequence is assumed to be in one-letter code. Gaps are represented as '-'. The length of the sequence (L) should match the dimensions of the Potts model.
potts_model : dict
A dictionary containing the Potts model parameters 'h' (fields) and 'J' (couplings). The fields are a 2D array of shape (L, 20), where L is the length of the sequence and 20 is the number of amino acids. The couplings are a 4D array of shape (L, L, 20, 20). The fields and couplings are assumed to be in units of energy.
mask : np.array
A 2D Boolean array that determines which residue pairs should be considered in the energy computation. The mask should have dimensions (L, L), where L is the length of the sequence.
ignore_couplings_of_gaps : bool, optional
If True, couplings involving gaps ('-') in the sequence are set to 0 in the energy calculation.
Default is False.
ignore_fields_of_gaps : bool, optional
If True, fields corresponding to gaps ('-') in the sequence are set to 0 in the energy calculation.
Default is False.
Returns
-------
energy : float
The computed energy of the protein sequence based on the Potts model and the interaction mask.
Examples
--------
>>> seq = "ACDEFGHIKLMNPQRSTVWY"
>>> potts_model = {
'h': np.random.rand(20, 20), # Random fields
'J': np.random.rand(20, 20, 20, 20) # Random couplings
}
>>> mask = np.ones((len(seq), len(seq)), dtype=bool) # Include all pairs
>>> energy = compute_native_energy(seq, potts_model, mask)
>>> print(f"Computed energy: {energy:.2f}")
Notes
-----
The energy is computed as the sum of the fields and the half-sum of the couplings for all pairs of residues
where the mask is True. The division by 2 for the couplings accounts for double-counting in symmetric
matrices.
.. todo:: Optimize the computation.
"""
seq_index = np.array([_AA.find(aa) for aa in seq])
seq_len = len(seq_index)
pos1, pos2 = np.meshgrid(np.arange(seq_len), np.arange(seq_len), indexing='ij', sparse=True)
aa1, aa2 = np.meshgrid(seq_index, seq_index, indexing='ij', sparse=True)
h = -potts_model['h'][range(seq_len), seq_index]
j = -potts_model['J'][pos1, pos2, aa1, aa2]
j_prime = j * mask
gap_indices=[int(i) for i,j in enumerate(seq) if j=="-"]
if ignore_gap_couplings==True:
if len(gap_indices)>0:
j_prime[gap_indices,:]=False
j_prime[:,gap_indices]=False
if ignore_gap_fields==True:
if len(gap_indices)>0:
h[gap_indices]=False
energy = h.sum() + j_prime.sum() / 2
return energy
[docs]
def compute_fields_energy(seq: str,
potts_model: dict,
ignore_fields_of_gaps: bool = False) -> float:
"""
Computes the fields energy of a protein sequence based on a given Potts model.
.. math::
E = \\sum_i h_i
Parameters
----------
seq : str
The amino acid sequence of the protein. The sequence is assumed to be in one-letter code. Gaps are represented as '-'. The length of the sequence (L) should match the dimensions of the Potts model.
potts_model : dict
A dictionary containing the Potts model parameters 'h' (fields) and 'J' (couplings). The fields are a 2D array of shape (L, 20), where L is the length of the sequence and 20 is the number of amino acids. The couplings are a 4D array of shape (L, L, 20, 20). The fields and couplings are assumed to be in units of energy.
ignore_fields_of_gaps : bool, optional
If True, fields corresponding to gaps ('-') in the sequence are set to 0 in the energy calculation.
Default is False.
Returns
-------
fields_energy : float
The computed fields energy of the protein sequence based on the Potts model
Examples
--------
>>> seq = "ACDEFGHIKLMNPQRSTVWY"
>>> potts_model = {
'h': np.random.rand(20, 20), # Random fields
'J': np.random.rand(20, 20, 20, 20) # Random couplings
}
>>> fields_energy = compute_fields_energy(seq, potts_model)
>>> print(f"Computed fields energy: {fields_energy:.2f}")
"""
seq_index = np.array([_AA.find(aa) for aa in seq])
seq_len = len(seq_index)
h = -potts_model['h'][range(seq_len), seq_index]
if ignore_fields_of_gaps==True:
gap_indices=[int(i) for i,j in enumerate(seq) if j=="-"]
if len(gap_indices)>0:
h[gap_indices]=False
fields_energy=h.sum()
return fields_energy
[docs]
def compute_couplings_energy(seq: str,
potts_model: dict,
mask: np.array,
ignore_couplings_of_gaps: bool = False) -> float:
"""
Computes the couplings energy of a protein sequence based on a given Potts model and an interaction mask.
.. math::
E = \\frac{1}{2} \\sum_{i,j} J_{ij} \\Theta_{ij}
Parameters
----------
seq : str
The amino acid sequence of the protein. The sequence is assumed to be in one-letter code. Gaps are represented as '-'. The length of the sequence (L) should match the dimensions of the Potts model.
potts_model : dict
A dictionary containing the Potts model parameters 'h' (fields) and 'J' (couplings). The fields are a 2D array of shape (L, 20), where L is the length of the sequence and 20 is the number of amino acids. The couplings are a 4D array of shape (L, L, 20, 20). The fields and couplings are assumed to be in units of energy.
mask : np.array
A 2D Boolean array that determines which residue pairs should be considered in the energy computation. The mask should have dimensions (L, L), where L is the length of the sequence.
ignore_couplings_of_gaps : bool, optional
If True, couplings involving gaps ('-') in the sequence are set to 0 in the energy calculation.
Default is False.
Returns
-------
couplings_energy : float
The computed couplings energy of the protein sequence based on the Potts model and the interaction mask.
Examples
--------
>>> seq = "ACDEFGHIKLMNPQRSTVWY"
>>> potts_model = {
'h': np.random.rand(20, 20), # Random fields
'J': np.random.rand(20, 20, 20, 20) # Random couplings
}
>>> mask = np.ones((len(seq), len(seq)), dtype=bool) # Include all pairs
>>> couplings_energy = compute_couplings_energy(seq, potts_model, mask)
>>> print(f"Computed couplings energy: {couplings_energy:.2f}")
Notes
-----
The couplings energy is computed as the half-sum of the couplings for all pairs of residues
where the mask is True. The division by 2 for the couplings accounts for double-counting in symmetric
matrices.
.. todo:: Optimize the computation.
"""
seq_index = np.array([_AA.find(aa) for aa in seq])
seq_len = len(seq_index)
pos1, pos2 = np.meshgrid(np.arange(seq_len), np.arange(seq_len), indexing='ij', sparse=True)
aa1, aa2 = np.meshgrid(seq_index, seq_index, indexing='ij', sparse=True)
j = -potts_model['J'][pos1, pos2, aa1, aa2]
j_prime = j * mask
if ignore_couplings_of_gaps==True:
gap_indices=[i for i,j in enumerate(seq) if j=="-"]
if len(gap_indices)>0:
j_prime[:,gap_indices]=False
j_prime[gap_indices,:]=False
couplings_energy = j_prime.sum() / 2
return couplings_energy
[docs]
def compute_sequences_energy(seqs: list,
potts_model: dict,
mask: np.array,
split_couplings_and_fields = False) -> np.array:
"""
Computes the energy of multiple protein sequences based on a given Potts model and an interaction mask.
.. math::
E = \\sum_i h_i + \\frac{1}{2} \\sum_{i,j} J_{ij} \\Theta_{ij}
Parameters
----------
seqs : list
List of amino acid sequences in string format, separated by commas. The sequences are assumed to be in one-letter code. Gaps are represented as '-'. The length of each sequence (L) should all match the dimensions of the Potts model.
potts_model : dict
A dictionary containing the Potts model parameters 'h' (fields) and 'J' (couplings). The fields are a 2D array of shape (L, 20), where L is the length of the sequences and 20 is the number of amino acids. The couplings are a 4D array of shape (L, L, 20, 20). The fields and couplings are assumed to be in units of energy.
mask : np.array
A 2D Boolean array that determines which residue pairs should be considered in the energy computation. The mask should have dimensions (L, L), where L is the length of the sequences.
split_couplings_and_fields : bool, optional
If True, two lists of the sequences' couplings and fields energies are returned.
Default is False.
Returns
-------
energy (if split_couplings_and_fields==False): float
The computed energies of the protein sequences based on the Potts model and the interaction mask.
fields_couplings_energy (if split_couplings_and_fields==True): np.array
Array containing computed fields and couplings energies of the protein sequences based on the Potts model and the interaction mask.
Examples
--------
>>> seq_list = ["ACDEFGHIKLMNPQRSTVWY","AKLWYMNPQRSTCDEFGHIV"]
>>> potts_model = {
'h': np.random.rand(20, 20), # Random fields
'J': np.random.rand(20, 20, 20, 20) # Random couplings
}
>>> mask = np.ones((len(seq_list[0]), len(seq_list[0])), dtype=bool) # Include all pairs
>>> energies = compute_sequences_energy(seq_list, potts_model, mask)
>>> print(f"Sequence 1 energy: {energies[0]:.2f}")
>>> print(f"Sequence 2 energy: {energies[1]:.2f}")
Notes
-----
The couplings energy is computed as the half-sum of the couplings for all pairs of residues
where the mask is True. The division by 2 for the couplings accounts for double-counting in symmetric
matrices.
.. todo:: Optimize the computation.
"""
seq_index = np.array([[_AA.find(aa) for aa in seq] for seq in seqs])
N_seqs, seq_len = seq_index.shape
pos_index=np.repeat([np.arange(seq_len)], N_seqs,axis=0)
pos1=np.array([np.meshgrid(p, p, indexing='ij', sparse=True)[0] for p in pos_index])
pos2=np.array([np.meshgrid(p, p, indexing='ij', sparse=True)[1] for p in pos_index])
aa1=np.array([np.meshgrid(s, s, indexing='ij', sparse=True)[0] for s in seq_index])
aa2=np.array([np.meshgrid(s, s, indexing='ij', sparse=True)[1] for s in seq_index])
h = -potts_model['h'][pos_index,seq_index]
j = -potts_model['J'][pos1, pos2, aa1, aa2]
j_prime = j * mask
if split_couplings_and_fields:
fields_couplings_energy=np.array([h.sum(axis=-1),j_prime.sum(axis=-1).sum(axis=-1) / 2])
return fields_couplings_energy
else:
energy = h.sum(axis=-1) + j_prime.sum(axis=-1).sum(axis=-1) / 2
return energy
[docs]
def compute_singleresidue_decoy_energy_fluctuation(seq: str,
potts_model: dict,
mask: np.array) -> np.array:
"""
Computes a (Lx21) matrix for a sequence of length L. Row i contains all possible changes in energy upon mutating residue i.
.. math::
\\Delta H_i = \\Delta h_i + \\sum_k \\Delta j_{ik}
Parameters
----------
seq : str
The amino acid sequence of the protein. The sequence is assumed to be in one-letter code. Gaps are represented as '-'. The length of the sequence (L) should match the dimensions of the Potts model.
potts_model : dict
A dictionary containing the Potts model parameters 'h' (fields) and 'J' (couplings). The fields are a 2D array of shape (L, 20), where L is the length of the sequence and 20 is the number of amino acids. The couplings are a 4D array of shape (L, L, 20, 20). The fields and couplings are assumed to be in units of energy.
mask : np.array
A 2D Boolean array that determines which residue pairs should be considered in the energy computation. The mask should have dimensions (L, L), where L is the length of the sequence.
Returns
-------
decoy_energy: np.array
(Lx21) matrix describing the energetic changes upon mutating a single residue.
Examples
--------
>>> seq = "ACDEFGHIKLMNPQRSTVWY"
>>> potts_model = {
'h': np.random.rand(20, 20), # Random fields
'J': np.random.rand(20, 20, 20, 20) # Random couplings
}
>>> mask = np.ones((len(seq), len(seq)), dtype=bool) # Include all pairs
>>> decoy_energy = compute_singleresidue_decoy_energy_fluctuation(seq, potts_model, mask)
>>> print(f"Matrix of Residue Decoy Energy Fluctuations: "); print(decoy_energy)
>>> print(f"Matrix Size: "); print(shape(decoy_energy))
Notes
-----
The couplings energy is computed as the half-sum of the couplings for all pairs of residues
where the mask is True. The division by 2 for the couplings accounts for double-counting in symmetric
matrices.
.. todo:: Optimize the computation.
"""
seq_index = np.array([_AA.find(aa) for aa in seq])
seq_len = len(seq_index)
# Create decoys
pos1, aa1 = np.meshgrid(np.arange(seq_len), np.arange(21), indexing='ij', sparse=True)
decoy_energy = np.zeros([seq_len, 21])
decoy_energy -= (potts_model['h'][pos1, aa1] - potts_model['h'][pos1, seq_index[pos1]]) # h correction aa1
j_correction = np.zeros([seq_len, seq_len, 21])
# J correction interactions with other aminoacids
reduced_j = potts_model['J'][range(seq_len), :, seq_index, :].astype(np.float32)
j_correction += reduced_j[:, pos1, seq_index[pos1]] * mask[:, pos1]
j_correction -= reduced_j[:, pos1, aa1] * mask[:, pos1]
# J correction, interaction with self aminoacids
decoy_energy += j_correction.sum(axis=0)
return decoy_energy
[docs]
def compute_mutational_decoy_energy_fluctuation(seq: str,
potts_model: dict,
mask: np.array, ) -> np.array:
"""
Computes a (LxLx21x21) matrix for a sequence of length L. Matrix[i,j] describes all possible changes in energy upon mutating residue i and j simultaneously.
.. math::
\Delta H_{ij} = H_i - H_{i'} + H_{j}-H_{j'} + J_{ij} -J_{ij'} + J_{i'j'} - J_{i'j} + \\sum_k {J_{ik} - J_{i'k} + J_{jk} -J_{j'k}}
Parameters
----------
seq : str
The amino acid sequence of the protein. The sequence is assumed to be in one-letter code. Gaps are represented as '-'. The length of the sequence (L) should match the dimensions of the Potts model.
potts_model : dict
A dictionary containing the Potts model parameters 'h' (fields) and 'J' (couplings). The fields are a 2D array of shape (L, 20), where L is the length of the sequence and 20 is the number of amino acids. The couplings are a 4D array of shape (L, L, 20, 20). The fields and couplings are assumed to be in units of energy.
mask : np.array
A 2D Boolean array that determines which residue pairs should be considered in the energy computation. The mask should have dimensions (L, L), where L is the length of the sequence.
Returns
-------
decoy_energy2: np.array
(LxLx21x21) matrix describing the energetic changes upon mutating two residues simultaneously.
Examples
--------
>>> seq = "ACDEFGHIKLMNPQRSTVWY"
>>> potts_model = {
'h': np.random.rand(20, 20), # Random fields
'J': np.random.rand(20, 20, 20, 20) # Random couplings
}
>>> mask = np.ones((len(seq), len(seq)), dtype=bool) # Include all pairs
>>> decoy_energy2 = compute_mutational_decoy_energy_fluctuation(seq, potts_model, mask)
>>> print(f"Matrix of Contact Mutational Decoy Energy Fluctuations: "); print(decoy_energy2)
>>> print(f"Matrix Size: "); print(shape(decoy_energy2))
Notes
-----
The couplings energy is computed as the half-sum of the couplings for all pairs of residues
where the mask is True. The division by 2 for the couplings accounts for double-counting in symmetric
matrices.
.. todo:: Optimize the computation.
"""
seq_index = np.array([_AA.find(aa) for aa in seq])
seq_len = len(seq_index)
# Create masked decoys
pos1,pos2=np.where(mask>0)
contacts_len=len(pos1)
pos1,aa1,aa2=np.meshgrid(pos1, np.arange(21), np.arange(21), indexing='ij', sparse=True)
pos2,aa1,aa2=np.meshgrid(pos2, np.arange(21), np.arange(21), indexing='ij', sparse=True)
#Compute fields
decoy_energy = np.zeros([contacts_len, 21, 21])
decoy_energy -= (potts_model['h'][pos1, aa1] - potts_model['h'][pos1, seq_index[pos1]]) # h correction aa1
decoy_energy -= (potts_model['h'][pos2, aa2] - potts_model['h'][pos2, seq_index[pos2]]) # h correction aa2
#Compute couplings
j_correction = np.zeros([contacts_len, 21, 21])
for pos, aa in enumerate(seq_index):
# J correction interactions with other aminoacids
reduced_j = potts_model['J'][pos, :, aa, :].astype(np.float32)
j_correction += reduced_j[pos1, seq_index[pos1]] * mask[pos, pos1]
j_correction -= reduced_j[pos1, aa1] * mask[pos, pos1]
j_correction += reduced_j[pos2, seq_index[pos2]] * mask[pos, pos2]
j_correction -= reduced_j[pos2, aa2] * mask[pos, pos2]
# J correction, interaction with self aminoacids
j_correction -= potts_model['J'][pos1, pos2, seq_index[pos1], seq_index[pos2]] * mask[pos1, pos2] # Taken two times
j_correction += potts_model['J'][pos1, pos2, aa1, seq_index[pos2]] * mask[pos1, pos2] # Added mistakenly
j_correction += potts_model['J'][pos1, pos2, seq_index[pos1], aa2] * mask[pos1, pos2] # Added mistakenly
j_correction -= potts_model['J'][pos1, pos2, aa1, aa2] * mask[pos1, pos2] # Correct combination
decoy_energy += j_correction
decoy_energy2=np.zeros([seq_len,seq_len,21,21])
decoy_energy2[mask]=decoy_energy
return decoy_energy2
[docs]
def compute_configurational_decoy_energy_fluctuation(seq: str,
potts_model: dict,
mask: np.array, ) -> np.array:
"""
Computes a (LxLx21x21) matrix for a sequence of length L. Matrix[i,j] describes all possible changes in energy upon mutating and altering the
local densities of residue i and j simultaneously.
.. math::
\Delta H_{ij} = H_i - H_{i'} + H_{j}-H_{j'} + J_{ij} -J_{ij'} + J_{i'j'} - J_{i'j} + \\sum_k {J_{ik} - J_{i'k} + J_{jk} -J_{j'k}}
Parameters
----------
seq : str
The amino acid sequence of the protein. The sequence is assumed to be in one-letter code. Gaps are represented as '-'. The length of the sequence (L) should match the dimensions of the Potts model.
potts_model : dict
A dictionary containing the Potts model parameters 'h' (fields) and 'J' (couplings). The fields are a 2D array of shape (L, 20), where L is the length of the sequence and 20 is the number of amino acids. The couplings are a 4D array of shape (L, L, 20, 20). The fields and couplings are assumed to be in units of energy.
mask : np.array
A 2D Boolean array that determines which residue pairs should be considered in the energy computation. The mask should have dimensions (L, L), where L is the length of the sequence.
Returns
-------
decoy_energy2: np.array
(LxLx21x21) matrix describing the energetic changes upon mutating and altering the local densities of two residues simultaneously.
Examples
--------
>>> seq = "ACDEFGHIKLMNPQRSTVWY"
>>> potts_model = {
'h': np.random.rand(20, 20), # Random fields
'J': np.random.rand(20, 20, 20, 20) # Random couplings
}
>>> mask = np.ones((len(seq), len(seq)), dtype=bool) # Include all pairs
>>> decoy_energy2 = compute_configurational_decoy_energy_fluctuation(seq, potts_model, mask)
>>> print(f"Matrix of Contact Configurational Decoy Energy Fluctuations: "); print(decoy_energy2)
>>> print(f"Matrix Size: "); print(shape(decoy_energy2))
Notes
-----
The couplings energy is computed as the half-sum of the couplings for all pairs of residues
where the mask is True. The division by 2 for the couplings accounts for double-counting in symmetric
matrices.
.. todo:: Optimize the computation.
"""
seq_index = np.array([_AA.find(aa) for aa in seq])
seq_len = len(seq_index)
# Create masked decoys
pos1,pos2=np.where(mask>0)
contacts_len=len(pos1)
pos1,aa1,aa2=np.meshgrid(pos1, np.arange(21), np.arange(21), indexing='ij', sparse=True)
pos2,aa1,aa2=np.meshgrid(pos2, np.arange(21), np.arange(21), indexing='ij', sparse=True)
#Compute fields
decoy_energy = np.zeros([contacts_len, 21, 21])
decoy_energy -= (potts_model['h'][pos1, aa1] - potts_model['h'][pos1, seq_index[pos1]]) # h correction aa1
decoy_energy -= (potts_model['h'][pos2, aa2] - potts_model['h'][pos2, seq_index[pos2]]) # h correction aa2
#Compute couplings
j_correction = np.zeros([contacts_len, 21, 21])
for pos, aa in enumerate(seq_index):
# J correction interactions with other aminoacids
reduced_j = potts_model['J'][pos, :, aa, :].astype(np.float32)
j_correction += reduced_j[pos1, seq_index[pos1]] * mask[pos, pos1]
j_correction -= reduced_j[pos1, aa1] * mask.mean()
j_correction += reduced_j[pos2, seq_index[pos2]] * mask[pos, pos2]
j_correction -= reduced_j[pos2, aa2] * mask.mean()
# J correction, interaction with self aminoacids
j_correction -= potts_model['J'][pos1, pos2, seq_index[pos1], seq_index[pos2]] * mask[pos1, pos2] # Taken two times
j_correction += potts_model['J'][pos1, pos2, aa1, seq_index[pos2]] * mask.mean() # Added mistakenly
j_correction += potts_model['J'][pos1, pos2, seq_index[pos1], aa2] * mask.mean() # Added mistakenly
j_correction -= potts_model['J'][pos1, pos2, aa1, aa2] * mask.mean() # Correct combination
decoy_energy += j_correction
decoy_energy2=np.zeros([seq_len,seq_len,21,21])
decoy_energy2[mask]=decoy_energy
return decoy_energy2
[docs]
def compute_decoy_energy(seq: str, potts_model: dict, mask: np.array, kind='singleresidue') -> np.array:
"""
Computes all possible decoy energies.
Parameters
----------
seq : str
The amino acid sequence of the protein. The sequence is assumed to be in one-letter code. Gaps are represented as '-'. The length of the sequence (L) should match the dimensions of the Potts model.
potts_model : dict
A dictionary containing the Potts model parameters 'h' (fields) and 'J' (couplings). The fields are a 2D array of shape (L, 20), where L is the length of the sequence and 20 is the number of amino acids. The couplings are a 4D array of shape (L, L, 20, 20). The fields and couplings are assumed to be in units of energy.
mask : np.array
A 2D Boolean array that determines which residue pairs should be considered in the energy computation. The mask should have dimensions (L, L), where L is the length of the sequence.
kind : str
Kind of decoys generated. Options: "singleresidue," "mutational," "configurational," and "contact."
Returns
-------
decoy_energy: np.array
Matrix describing all possible decoy energies.
Examples
--------
>>> seq = "ACDEFGHIKLMNPQRSTVWY"
>>> potts_model = {
'h': np.random.rand(20, 20), # Random fields
'J': np.random.rand(20, 20, 20, 20) # Random couplings
}
>>> mask = np.ones((len(seq), len(seq)), dtype=bool) # Include all pairs
>>> kind = "singleresidue"
>>> decoy_energy = compute_decoy_energy(seq, potts_model, mask, kind)
>>> print(f"Matrix of Single Residue Decoy Energo: "); print(decoy_energy2)
>>> print(f"Matrix Size: "); print(shape(decoy_energy2))
Notes
-----
The couplings energy is computed as the half-sum of the couplings for all pairs of residues
where the mask is True. The division by 2 for the couplings accounts for double-counting in symmetric
matrices.
.. todo:: Optimize the computation.
"""
native_energy = compute_native_energy(seq, potts_model, mask)
if kind == 'singleresidue':
decoy_energy=native_energy + compute_singleresidue_decoy_energy_fluctuation(seq, potts_model, mask)
elif kind == 'mutational':
decoy_energy=native_energy + compute_mutational_decoy_energy_fluctuation(seq, potts_model, mask)
elif kind == 'configurational':
decoy_energy=native_energy + compute_configurational_decoy_energy_fluctuation(seq, potts_model, mask)
elif kind == 'contact':
decoy_energy=native_energy + compute_contact_decoy_energy_fluctuation(seq, potts_model, mask)
return decoy_energy
[docs]
def compute_aa_freq(seq, include_gaps=True):
"""
Calculates amino acid frequencies in given sequence
Parameters
----------
seq : str
The amino acid sequence of the protein. The sequence is assumed to be in one-letter code. Gaps are represented as '-'.
include_gaps: bool
If True, frequencies of gaps ('-') in the sequence are set to 0.
Default is True.
Returns
-------
aa_freq: np.array
Array of frequencies of all 21 possible amino acids within sequence
"""
seq_index = np.array([_AA.find(aa) for aa in seq])
aa_freq = np.array([(seq_index == i).sum() for i in range(21)])
if not include_gaps:
aa_freq[0] = 0
return aa_freq
[docs]
def compute_single_frustration(decoy_fluctuation,
aa_freq=None,
correction=0):
"""
Calculates single residue frustration indices
Parameters
----------
decoy_fluctuation: np.array
(Lx21) matrix for a sequence of length L, describing the energetic changes upon mutating a single residue.
aa_freq: np.array
Array of frequencies of all 21 possible amino acids within sequence
Returns
-------
frustration: np.array
Array of length L featuring single residue frustration indices.
"""
if aa_freq is None:
aa_freq = np.ones(21)
mean_energy = (aa_freq * decoy_fluctuation).sum(axis=1) / aa_freq.sum()
std_energy = np.sqrt(
((aa_freq * (decoy_fluctuation - mean_energy[:, np.newaxis]) ** 2) / aa_freq.sum()).sum(axis=1))
frustration = -mean_energy / (std_energy + correction)
frustration *= -1
return frustration
[docs]
def compute_pair_frustration(decoy_fluctuation,
contact_freq: Union[None, np.array],
correction=0) -> np.array:
"""
Calculates pair residue frustration indices
Parameters
----------
decoy_fluctuation: np.array
(LxLx21x21) matrix for a sequence of length L, describing the energetic changes upon mutating two residues simultaneously.
contact_freq: np.array
21x21 array of frequencies of all possible contacts within sequence.
Returns
-------
contact_frustration: np.array
LxL array featuring pair frustration indices (mutational or configurational frustration, depending on
decoy_fluctuation matrix provided)
"""
if contact_freq is None:
contact_freq = np.ones([21, 21])
decoy_energy = decoy_fluctuation
seq_len = decoy_fluctuation.shape[0]
average = np.average(decoy_energy.reshape(seq_len * seq_len, 21 * 21), weights=contact_freq.flatten(), axis=-1)
variance = np.average((decoy_energy.reshape(seq_len * seq_len, 21 * 21) - average[:, np.newaxis]) ** 2,
weights=contact_freq.flatten(), axis=-1)
mean_energy = average.reshape(seq_len, seq_len)
std_energy = np.sqrt(variance).reshape(seq_len, seq_len)
contact_frustration = -mean_energy / (std_energy + correction)
contact_frustration *= -1
return contact_frustration
[docs]
def compute_scores(potts_model: dict) -> np.array:
"""
Computes contact scores based on the Frobenius norm
.. math::
CN[i,j] = \\frac{F[i,j] - F[i,:] * F[:,j}{F[:,:]}
Parameters
----------
potts_model : dict
Potts model containing the couplings in the "J" key
Returns
-------
corr_norm : np.array
Contact score matrix (N x N)
"""
j = potts_model['J']
n, _, __, q = j.shape
norm = np.linalg.norm(j.reshape(n * n, q * q), axis=1).reshape(n, n) # Frobenius norm
norm_mean = np.mean(norm, axis=0) / (n - 1) * n
norm_mean_all = np.mean(norm) / (n - 1) * n
corr_norm = norm - norm_mean[:, np.newaxis] * norm_mean[np.newaxis, :] / norm_mean_all
corr_norm[np.diag_indices(n)] = 0
corr_norm = np.mean([corr_norm, corr_norm.T], axis=0) # Symmetrize matrix
return corr_norm
[docs]
def compute_roc(scores, distance_matrix, cutoff):
"""
Computes Receiver Operating Characteristic (ROC) curve of
predicted and true contacts (identified from the distance matrix).
Parameters
----------
scores : np.array
Contact score matrix (N x N)
distance_matrix : np.array
LxL array for sequence of length L, describing distances between contacts
cutoff : float
Distance cutoff for contacts
Returns
-------
roc_score : np.array
Array containing lists of false and true positive rates
"""
scores = sdist.squareform(scores)
distance = sdist.squareform(distance_matrix)
results = np.array([np.array(scores), np.array(distance)])
results = results[:, results[0, :].argsort()[::-1]] # Sort results by score
if cutoff!= None:
contacts = results[1] <= cutoff
else:
contacts = results[1]>0
not_contacts = ~contacts
tpr = np.concatenate([[0], contacts.cumsum() / contacts.sum()])
fpr = np.concatenate([[0], not_contacts.cumsum() / not_contacts.sum()])
roc_score=np.array([fpr, tpr])
return roc_score
[docs]
def compute_auc(roc_score):
"""
Computes Area Under Curve (AUC) of calculated ROC distribution
Parameters
----------
roc_score : np.array
Array containing lists of false and true positive rates
Returns
-------
auc : float
AUC value
"""
fpr, tpr = roc
auc = np.sum(tpr[:-1] * (fpr[1:] - fpr[:-1]))
return auc
[docs]
def plot_roc(roc_score):
"""
Plot ROC distribution
Parameters
----------
roc_score : np.array
Array containing lists of false and true positive rates
"""
import matplotlib.pyplot as plt
plt.plot(roc[0], roc[1])
plt.xlabel('False positive rate (1-specificity)')
plt.ylabel('True positive rate (sensiticity)')
plt.suptitle('Receiver operating characteristic')
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.plot([0, 1], [0, 1], '--')
[docs]
def plot_singleresidue_decoy_energy(decoy_energy, native_energy, method='clustermap'):
"""
Plot comparison of single residue decoy energies, relative to the native energy
Parameters
----------
decoy_energy : np.array
Lx21 array of decoy energies
native_energy : float
Native energy value
method : str
Options: "clustermap", "heatmap"
"""
import seaborn as sns
if method=='clustermap':
f=sns.clustermap
elif method == 'heatmap':
f = sns.heatmap
g = f(decoy_energy, cmap='RdBu_r',
vmin=native_energy - decoy_energy.std() * 3,
vmax=native_energy + decoy_energy.std() * 3)
AA_dict = {str(i): _AA[i] for i in range(len(_AA))}
new_ticklabels = []
if method == 'clustermap':
ax_heatmap = g.ax_heatmap
else:
ax_heatmap = g.axes
for t in ax_heatmap.get_xticklabels():
t.set_text(AA_dict[t.get_text()])
new_ticklabels += [t]
ax_heatmap.set_xticklabels(new_ticklabels)
return g
[docs]
def write_tcl_script(pdb_file: Union[Path,str], chain: str, mask: np.array, distance_matrix: np.array, distance_cutoff: float, single_frustration: np.array,
pair_frustration: np.array, tcl_script: Union[Path, str] ='frustration.tcl',max_connections: int =None, movie_name: Union[Path, str] =None, still_image_name: Union[Path, str] =None) -> Union[Path, str]:
"""
Writes a tcl script that can be run with VMD to superimpose the frustration patterns onto the corresponding PDB structure.
Parameters
----------
pdb_file : Path or str
pdb file name
chain : str
Select chain from pdb
mask : np.array
A 2D Boolean array that determines which residue pairs should be considered in the energy computation. The mask should have dimensions (L, L), where L is the length of the sequence.
distance_matrix : np.array
LxL array for sequence of length L, describing distances between contacts
distance_cutoff : float
Maximum distance at which a contact occurs
single_frustration : np.array
Array containing single residue frustration index values
pair_frustration : np.array
Array containing pair (ex. configurational, mutational, contact) frustration index values
tcl_script : Path or str
Output tcl script file with static structure
max_connections : int
Maximum number of pair frustration values visualized in tcl file
movie_name : Path or str
Output movie file with rotating structure
still_image_name : Path or str
Output image file with still image
Returns
-------
tcl_script : Path or str
tcl script file
"""
fo = open(tcl_script, 'w+')
single_frustration = np.nan_to_num(single_frustration,nan=0,posinf=0,neginf=0)
pair_frustration = np.nan_to_num(pair_frustration,nan=0,posinf=0,neginf=0)
structure = prody.parsePDB(str(pdb_file))
selection = structure.select('protein', chain=chain)
residues = np.unique(selection.getResnums())
fo.write(f'[atomselect top all] set beta 0\n')
# Single residue frustration
for r, f in zip(residues, single_frustration):
# print(f)
fo.write(f'[atomselect top "chain {chain} and residue {int(r)}"] set beta {f}\n')
# Mutational frustration:
r1, r2 = np.meshgrid(residues, residues, indexing='ij')
sel_frustration = np.array([r1.ravel(), r2.ravel(), pair_frustration.ravel(),distance_matrix.ravel(), mask.ravel()]).T
#Filter with mask and distance
if distance_cutoff:
mask_dist=(sel_frustration[:, -2] <= distance_cutoff)
else:
mask_dist=np.ones(len(sel_frustration),dtype=bool)
sel_frustration = sel_frustration[mask_dist & (sel_frustration[:, -1] > 0)]
minimally_frustrated = sel_frustration[sel_frustration[:, 2] < -0.78]
#minimally_frustrated = sel_frustration[sel_frustration[:, 2] < -1.78]
sort_index = np.argsort(minimally_frustrated[:, 2])
minimally_frustrated = minimally_frustrated[sort_index]
if max_connections:
minimally_frustrated = minimally_frustrated[:max_connections]
fo.write('draw color green\n')
for (r1, r2, f, d ,m) in minimally_frustrated:
r1=int(r1)
r2=int(r2)
if abs(r1-r2) == 1: # don't draw interactions between residues adjacent in sequence
continue
pos1 = selection.select(f'resid {r1} and chain {chain} and (name CB or (resname GLY and name CA))').getCoords()[0]
pos2 = selection.select(f'resid {r2} and chain {chain} and (name CB or (resname GLY and name CA))').getCoords()[0]
distance = np.linalg.norm(pos1 - pos2)
if d > 9.5 or d < 3.5:
continue
fo.write(f'lassign [[atomselect top "resid {r1} and name CA and chain {chain}"] get {{x y z}}] pos1\n')
fo.write(f'lassign [[atomselect top "resid {r2} and name CA and chain {chain}"] get {{x y z}}] pos2\n')
if 3.5 <= distance <= 6.5:
fo.write(f'draw line $pos1 $pos2 style solid width 2\n')
else:
fo.write(f'draw line $pos1 $pos2 style dashed width 2\n')
frustrated = sel_frustration[sel_frustration[:, 2] > 1]
#frustrated = sel_frustration[sel_frustration[:, 2] > 0]
sort_index = np.argsort(frustrated[:, 2])[::-1]
frustrated = frustrated[sort_index]
if max_connections:
frustrated = frustrated[:max_connections]
fo.write('draw color red\n')
for (r1, r2, f ,d, m) in frustrated:
r1=int(r1)
r2=int(r2)
if d > 9.5 or d < 3.5:
continue
fo.write(f'lassign [[atomselect top "resid {r1} and name CA and chain {chain}"] get {{x y z}}] pos1\n')
fo.write(f'lassign [[atomselect top "resid {r2} and name CA and chain {chain}"] get {{x y z}}] pos2\n')
if 3.5 <= d <= 6.5:
fo.write(f'draw line $pos1 $pos2 style solid width 2\n')
else:
fo.write(f'draw line $pos1 $pos2 style dashed width 2\n')
fo.write('''mol delrep top 0
mol color Beta
mol representation NewCartoon 0.300000 10.000000 4.100000 0
mol selection all
mol material Opaque
mol addrep top
color scale method GWR
''')
if movie_name:
fo.write('''axes location Off
color Display Background white
display resize 800 800
display projection Orthographic
display depthcue off
display resetview
display resize [expr [lindex [display get size] 0]/2*2] [expr [lindex [display get size] 1]/2*2] ;#Resize display to even height and width
display update ui
# Set up the movie directory and base file name
mkdir movie_tmp
set workdir "movie_tmp"
''' + f'set basename "{movie_name}"' + '''
set numframes 360
set framerate 25
# Function to rotate the molecule and capture frames
proc captureFrames {} {
global workdir basename numframes
for {set i 0} {$i < $numframes} {incr i} {
# Rotate the molecule around the Y-axis
rotate y by 1
# Capture the frame
set output [format "%s/$basename.%05d.tga" $workdir $i]
render snapshot $output
}
}
# Function to convert frames to MP4
proc convertToMP4 {} {
global workdir basename numframes framerate
set mybasefilename [format "%s/%s" $workdir $basename]
set outputFile [format "%s.mp4" $basename]
# Construct and execute the ffmpeg command
set command "ffmpeg -y -framerate $framerate -i $mybasefilename.%05d.tga -c:v libx264 -profile:v high -crf 20 -pix_fmt yuv420p $outputFile"
puts "Executing: $command"
exec ffmpeg -y -framerate $framerate -i $mybasefilename.%05d.tga -c:v libx264 -profile:v high -crf 20 -pix_fmt yuv420p $outputFile >&@ stdout
}
# Main script execution
captureFrames
convertToMP4
# Cleanup the TGA files if desired
for {set i 0} {$i < $numframes} {incr i} {
set output [format "%s/$basename.%05d.tga" $workdir $i]
exec rm $output
}
exit
''')
elif still_image_name:
fo.write(f'set output "{still_image_name}"' + '''
render snapshot $output
exit
''')
fo.close()
return tcl_script
[docs]
def call_vmd(pdb_file: Union[Path,str], tcl_script: Union[Path,str]):
"""
Calls VMD program with given pdb file and tcl script to visualize frustration patterns
Parameters
----------
pdb_file : Path or str
pdb file name
tcl_script : Path or str
Output tcl script file with static structure
"""
import subprocess
return subprocess.Popen(['vmd', '-e', tcl_script, pdb_file], stdin=subprocess.PIPE)
[docs]
def canvas(with_attribution=True):
"""
Placeholder function to show example docstring (NumPy format).
Replace this function and doc string for your own project.
Parameters
----------
with_attribution : bool, Optional, default: True
Set whether or not to display who the quote is from.
Returns
-------
quote : str
Compiled string including quote and optional attribution.
"""
quote = "The code is but a canvas to our imagination."
if with_attribution:
quote += "\n\t- Adapted from Henry David Thoreau"
return quote
###
[docs]
def make_decoy_seqs(seq, ndecoys=1000):
"""
Creates permutated, decoy sequences using a given sequence residue composition and length.
Parameters
----------
seq : str
The amino acid sequence of the protein. The sequence is assumed to be in one-letter code. Gaps are represented as '-'. The length of the sequence (L) should match the dimensions of the Potts model.
n_decoys: int
Number of sequence decoys to create
Return
-------
decoy_seqs : list
List of decoy sequences. The sequences are assumed to be in one-letter code. Gaps are represented as '-'. The length of the sequences (L) should match the dimensions of the Potts model.
"""
seq_array = np.array(list(seq))
decoy_seqs = [''.join(np.random.permutation(seq_array)) for _ in range(ndecoys)]
return decoy_seqs
[docs]
def compute_fragment_mask(mask: np.array,
fragment_pos: np.array)-> np.array:
"""
Creates a mask for a sequence fragment such that:
- position i belongs to the fragment, all j
- position j belongs to the fragment, all i
The new mask consider all the interactions within the fragment and also the interactions between the fragment and other sequence positions.
Parameters
----------
mask : np.array
A 2D Boolean array that determines which residue pairs should be considered in the energy computation. The mask should have dimensions (L, L), where L is the length of the sequence.
fragment_pos: np:array
Array of sequence positions selected.
Return
-------
fragment_mask: np.array
New 2D Boolean array that determines which residue pairs should be considered in the energy computation. The mask should have dimensions (L, L), where L is the length of the sequence.
"""
custom_mask=np.zeros((mask.shape[0],mask.shape[0]),dtype=bool)
custom_mask[fragment_pos]=True
custom_mask[:,fragment_pos]=True
fragment_mask = custom_mask*mask
return fragment_mask
[docs]
def compute_fragment_total_native_energy(seq: str,
potts_model: dict,
mask: np.array,
fragment_pos : Union[None, np.array] = None,
fragment_in_context = False ) -> float:
"""
Calculates the energy for the complete protein or for a fragment in context
Parameters
----------
seq : str
The amino acid sequence of the protein. The sequence is assumed to be in one-letter code. Gaps are represented as '-'. The length of the sequence (L) should match the dimensions of the Potts model.
potts_model : dict
A dictionary containing the Potts model parameters 'h' (fields) and 'J' (couplings). The fields are a 2D array of shape (L, 20), where L is the length of the sequence and 20 is the number of amino acids. The couplings are a 4D array of shape (L, L, 20, 20). The fields and couplings are assumed to be in units of energy.
mask : np.array
A 2D Boolean array that determines which residue pairs should be considered in the energy computation. The mask should have dimensions (L, L), where L is the length of the sequence.
fragment_pos: np:array
Array of sequence positions selected.
fragment_in_context: bool
If True, the energetics calculations take into account the interactions between the fragment and other sequence positions
Return
-------
energy: float
Native energy of the protein
"""
seq_index = np.array([_AA.find(aa) for aa in seq])
seq_len = len(seq_index)
pos1, pos2 = np.meshgrid(np.arange(seq_len), np.arange(seq_len), indexing='ij', sparse=True)
aa1, aa2 = np.meshgrid(seq_index, seq_index, indexing='ij', sparse=True)
if len(potts_model['J'].shape)==4:
# print('potts')
h = -potts_model['h'][range(seq_len), seq_index]
j = -potts_model['J'][pos1, pos2, aa1, aa2]
else:
#MJ
# print('MJ')
h = 0
j = -potts_model['J'][aa1, aa2]
if fragment_in_context:
h_mask=np.zeros(seq_len,dtype=int)
h_mask[fragment_pos]=1
j_mask=compute_fragment_mask(mask,fragment_pos)
else:
h_mask = 1
j_mask = 1
h_prime= h*h_mask
j_prime = j * j_mask
energy = h_prime.sum() + j_prime.sum() / 2
return energy
[docs]
def compute_fragment_total_decoy_energy(decoy_seqs: list,
potts_model: dict,
mask: np.array,
fragment_pos : Union[None, np.array] = None,
fragment_in_context = False,
split_couplings_and_fields = False,
config_decoys = False,
msa_mask = 1) -> np.array:
"""
Calculates decoy energies for the complete protein or for a fragment in context
Parameters
----------
decoy_seqs : list
List of decoy sequences. The sequences are assumed to be in one-letter code. Gaps are represented as '-'. The length of the sequences (L) should match the dimensions of the Potts model.
potts_model : dict
A dictionary containing the Potts model parameters 'h' (fields) and 'J' (couplings). The fields are a 2D array of shape (L, 20), where L is the length of the sequence and 20 is the number of amino acids. The couplings are a 4D array of shape (L, L, 20, 20). The fields and couplings are assumed to be in units of energy.
mask : np.array
A 2D Boolean array that determines which residue pairs should be considered in the energy computation. The mask should have dimensions (L, L), where L is the length of the sequence.
fragment_pos: np:array
Array of sequence positions selected.
fragment_in_context: bool
If True, the energetics calculations take into account the interactions between the fragment and other sequence positions
split_couplings_and_fields: bool
Separate output into coupling and local fields contribution to energy.
config_decoys: bool
If True, use the configurational decoys approximation, shuffling index positions for configurational decoys energy calculation. If False, mutational decoys.
msa_mask: np.array
Extra mask to use a Multiple Sequence Alignment that do not cover completely the reference PDB
Return
-------
energy: np.array
Decoy energies
"""
seq_index = np.array([[_AA.find(aa) for aa in seq] for seq in decoy_seqs])
N_seqs, seq_len = seq_index.shape
pos_index=np.repeat([np.arange(seq_len)], N_seqs,axis=0)
if config_decoys:
pos_index=np.array([np.random.choice(pos_index[0],
size=len(pos_index[0]),
replace=False) for x in range(pos_index.shape[0])])
mask=np.ones(mask.shape)*mask.mean()
mask=mask*msa_mask
pos1=np.array([np.meshgrid(p, p, indexing='ij', sparse=True)[0] for p in pos_index])
pos2=np.array([np.meshgrid(p, p, indexing='ij', sparse=True)[1] for p in pos_index])
aa1=np.array([np.meshgrid(s, s, indexing='ij', sparse=True)[0] for s in seq_index])
aa2=np.array([np.meshgrid(s, s, indexing='ij', sparse=True)[1] for s in seq_index])
if len(potts_model['J'].shape)==4:
h = -potts_model['h'][pos_index,seq_index]
j = -potts_model['J'][pos1, pos2, aa1, aa2]
else:
#MJ
h = 0
j = -potts_model['J'][aa1, aa2]
if fragment_in_context:
h_mask=np.zeros(seq_len,dtype=int)
h_mask[fragment_pos]=1
j_mask=compute_fragment_mask(mask,fragment_pos)
else:
h_mask = 1
j_mask = 1
h_prime= h*h_mask
j_prime = j * j_mask
if split_couplings_and_fields:
return np.array([h_prime.sum(axis=-1),j_prime.sum(axis=-1).sum(axis=-1) / 2])
else:
energy = h_prime.sum(axis=-1) + j_prime.sum(axis=-1).sum(axis=-1) / 2
return energy
[docs]
def compute_total_frustration(seq,
potts_model,
mask,
ndecoys = 1000,
config_decoys = False,
msa_mask = 1,
fragment_pos = None,
fragment_in_context = False,
output_kind = 'frustration'):
"""
Calculates the total frustration of a protein fragment.
Parameters
----------
seq : str
The amino acid sequence of the protein. The sequence is assumed to be in one-letter code. Gaps are represented as '-'. The length of the sequence (L) should match the dimensions of the Potts model.
potts_model : dict
A dictionary containing the Potts model parameters 'h' (fields) and 'J' (couplings). The fields are a 2D array of shape (L, 20), where L is the length of the sequence and 20 is the number of amino acids. The couplings are a 4D array of shape (L, L, 20, 20). The fields and couplings are assumed to be in units of energy.
mask : np.array
A 2D Boolean array that determines which residue pairs should be considered in the energy computation. The mask should have dimensions (L, L), where L is the length of the sequence.
n_decoys: int
Number of sequence decoys to create
config_decoys: bool
If True, use the configurational decoys approximation, shuffling index positions for configurational decoys energy calculation. If False, mutational decoys.
msa_mask: np.array
Extra mask to use a Multiple Sequence Alignment that do not cover completely the reference PDB
fragment_pos: np.array
Fragment positions. If None, use the complete model
fragment_in_context: bool
If True, the energetics calculations take into account the interactions between the fragment and other sequence positions
output_kind: str
If 'frustration', returns frustration. If not, returns native energy, decoy energy average and decoy energy standard deviation.
Return
-------
total_frustration : float
Total frustration of the fragment or complete protein
native_energy: float
Native energy of the given sequence
decoy_energy_average: float
Average of the decoy energy distribution
decoy_energy_std: float
Standard deviation of the decoy energy distribution
"""
native_energy = compute_fragment_total_native_energy(seq,
potts_model,
mask,
fragment_pos,
fragment_in_context)
decoy_seqs = make_decoy_seqs(seq,ndecoys=ndecoys)
decoy_energies = compute_fragment_total_decoy_energy(decoy_seqs,
potts_model,
mask,
fragment_pos,
fragment_in_context,
config_decoys = config_decoys,
msa_mask = msa_mask)
decoy_energy_average = decoy_energies.mean()
decoy_energy_std = decoy_energies.std()
total_frustration = (native_energy - decoy_energy_average)/ decoy_energy_std
if output_kind == 'frustration':
return total_frustration
else:
return native_energy, decoy_energy_average, decoy_energy_std
[docs]
def compute_native_h_J(seq: str,
potts_model: dict,
mask: np.array) -> tuple:
"""
Computes the applied fields h_i(a_i) and J_{ij}(a_i,a_j) for the residues a_i of the sequence
Parameters
----------
seq : str
The amino acid sequence of the protein. The sequence is assumed to be in one-letter code. Gaps are represented as '-'. The length of the sequence (L) should match the dimensions of the Potts model.
potts_model : dict
A dictionary containing the Potts model parameters 'h' (fields) and 'J' (couplings). The fields are a 2D array of shape (L, 20), where L is the length of the sequence and 20 is the number of amino acids. The couplings are a 4D array of shape (L, L, 20, 20). The fields and couplings are assumed to be in units of energy.
mask : np.array
A 2D Boolean array that determines which residue pairs should be considered in the energy computation. The mask should have dimensions (L, L), where L is the length of the sequence.
Returns
-------
h: np.array
Values of the local field for the sequence.
j: np.array
Values of the coupling field for the sequence.
"""
seq_index = np.array([_AA.find(aa) for aa in seq])
seq_len = len(seq_index)
pos1, pos2 = np.meshgrid(np.arange(seq_len), np.arange(seq_len), indexing='ij', sparse=True)
aa1, aa2 = np.meshgrid(seq_index, seq_index, indexing='ij', sparse=True)
if len(potts_model['J'].shape)==4:
# print('potts')
h = -potts_model['h'][range(seq_len), seq_index]
j = -potts_model['J'][pos1, pos2, aa1, aa2]
else:
#MJ
# print('MJ')
h = 0
j = -potts_model['J'][aa1, aa2]
return h, j
[docs]
def compute_decoy_h_J(decoy_seqs: list,
potts_model: dict,
mask: np.array,
config_decoys: bool = False) -> tuple:
"""
Computes the applied fields h_i(a_i) and J_{ij}(a_i,a_j) for the residues a_i of a set of decoy sequences
Parameters
----------
decoy_seqs : list
List of decoy sequences. The sequences are assumed to be in one-letter code. Gaps are represented as '-'. The length of the sequences (L) should match the dimensions of the Potts model.
potts_model : dict
A dictionary containing the Potts model parameters 'h' (fields) and 'J' (couplings). The fields are a 2D array of shape (L, 20), where L is the length of the sequence and 20 is the number of amino acids. The couplings are a 4D array of shape (L, L, 20, 20). The fields and couplings are assumed to be in units of energy.
mask : np.array
A 2D Boolean array that determines which residue pairs should be considered in the energy computation. The mask should have dimensions (L, L), where L is the length of the sequence.
config_decoys: bool
If True, use the configurational decoys approximation, shuffling index positions for configurational decoys energy calculation. If False, mutational decoys.
Returns
-------
h: np.array
Values of the local field for each decoy sequence.
j: np.array
Values of the coupling field for each decoy sequence.
"""
seq_index = np.array([[_AA.find(aa) for aa in seq] for seq in decoy_seqs])
N_seqs, seq_len = seq_index.shape
pos_index=np.repeat([np.arange(seq_len)], N_seqs,axis=0)
if config_decoys:
pos_index=np.array([np.random.choice(pos_index[0],
size=len(pos_index[0]),
replace=False) for x in range(pos_index.shape[0])])
mask=np.ones(mask.shape)*mask.mean()
pos1=np.array([np.meshgrid(p, p, indexing='ij', sparse=True)[0] for p in pos_index])
pos2=np.array([np.meshgrid(p, p, indexing='ij', sparse=True)[1] for p in pos_index])
aa1=np.array([np.meshgrid(s, s, indexing='ij', sparse=True)[0] for s in seq_index])
aa2=np.array([np.meshgrid(s, s, indexing='ij', sparse=True)[1] for s in seq_index])
if len(potts_model['J'].shape)==4:
h = -potts_model['h'][pos_index,seq_index]
j = -potts_model['J'][pos1, pos2, aa1, aa2]
else:
#MJ
h = 0
j = -potts_model['J'][aa1, aa2]
return h,j
[docs]
def compute_native_fragment_energy_from_h_j(fragment_pos: np.array,
h: np.array,
j: np.array,
mask: np.array)-> float:
"""
Computes the energy from the applied fields h_i(a_i) and J_{ij}(a_i,a_j)
Parameters
----------
fragment_pos: np:array
Array of sequence positions selected.
h: np.array
Values of the local field for each decoy sequence.
j: np.array
Values of the coupling field for each decoy sequence.
mask : np.array
A 2D Boolean array that determines which residue pairs should be considered in the energy computation. The mask should have dimensions (L, L), where L is the length of the sequence.
Returns
-------
energy: float
Native energy of the protein
"""
h_mask=np.zeros(len(h),dtype=int)
h_mask[fragment_pos]=1
j_mask=compute_fragment_mask(mask,fragment_pos)
h_prime= h*h_mask
j_prime = j * j_mask
energy = h_prime.sum() + j_prime.sum() / 2
return energy
[docs]
def compute_decoy_fragment_energy_from_h_j(fragment_pos: np.array,
h: np.array,
j: np.array,
mask: np.array)-> tuple:
"""
Computes the energy from the applied fields h_i(a_i) and J_{ij}(a_i,a_j) for each decoy sequence.
Parameters
----------
fragment_pos: np:array
Array of sequence positions selected.
h: np.array
Values of the local field for each decoy sequence.
j: np.array
Values of the coupling field for each decoy sequence.
mask : np.array
A 2D Boolean array that determines which residue pairs should be considered in the energy computation. The mask should have dimensions (L, L), where L is the length of the sequence.
Returns
-------
energy_average: float
Average of the decoy energies
energy_std: float
Standard deviation of the decoy energies
"""
h_mask=np.zeros(h.shape[1],dtype=int)
h_mask[fragment_pos]=1
j_mask=compute_fragment_mask(mask,fragment_pos)
h_prime= h*h_mask
j_prime = j * j_mask
energy = h_prime.sum(axis=-1) + j_prime.sum(axis=-1).sum(axis=-1) / 2
energy_average = energy.mean()
energy_std = energy.std()
return energy_average, energy_std
[docs]
def compute_energy_sliding_window(seq: str,
potts_model: dict,
mask: np.array,
win_size: int,
ndecoys: int,
config_decoys: bool) -> dict:
"""
Computes the total frustration, the native energy, the decoy average energy and the decoy standard deviation for fragments on a sliding window
Parameters
----------
seq : str
The amino acid sequence of the protein. The sequence is assumed to be in one-letter code. Gaps are represented as '-'. The length of the sequence (L) should match the dimensions of the Potts model.
potts_model : dict
A dictionary containing the Potts model parameters 'h' (fields) and 'J' (couplings). The fields are a 2D array of shape (L, 20), where L is the length of the sequence and 20 is the number of amino acids. The couplings are a 4D array of shape (L, L, 20, 20). The fields and couplings are assumed to be in units of energy.
mask : np.array
A 2D Boolean array that determines which residue pairs should be considered in the energy computation. The mask should have dimensions (L, L), where L is the length of the sequence.
win_size: int
Size of the sliding window
ndecoys: int
Number of decoy sequences to use
config_decoys: bool
If True, use the configurational decoys approximation, shuffling index positions for configurational decoys energy calculation. If False, mutational decoys.
Returns
-------
results: dict
Dictionary with the results, containing
'fragment_center': center position of each window
'win_size': size of the sliding windows
'native_energy': native energy for each window
'decoy_energy_av': decoy energy average for each window
'decoy_energy_std': decoy energy standard deviation for each window
'frustration': total frustration index for each window
"""
h, j = compute_native_h_J(seq, potts_model, mask)
decoy_seqs = make_decoy_seqs(seq, ndecoys=ndecoys)
h_decoys,j_decoys = compute_decoy_h_J(decoy_seqs, potts_model, mask, config_decoys)
dif = (win_size - 1) // 2
positions = np.arange(dif, len(seq) - dif)
e_native, e_decoy_av, e_decoy_std, frustration_sw = [], [], [], []
for i in positions:
fragment_pos = np.arange(i - dif, i + dif + 1)
native_energy = compute_native_fragment_energy_from_h_j(fragment_pos,h,j,mask)
decoy_avg, decoy_std = compute_decoy_fragment_energy_from_h_j(fragment_pos,
h_decoys,
j_decoys,
mask)
frustration_score = (native_energy - decoy_avg) / decoy_std if decoy_std != 0 else 0
e_native.append(native_energy)
e_decoy_av.append(decoy_avg)
e_decoy_std.append(decoy_std)
frustration_sw.append(frustration_score)
results = {
'fragment_center': positions,
'win_size': [win_size] * len(positions),
'native_energy': e_native,
'decoy_energy_av': e_decoy_av,
'decoy_energy_std': e_decoy_std,
'frustration': frustration_sw
}
return results