Source code for frustratometer.pdb.pdb

from Bio.PDB import PDBParser,MMCIFParser
import prody
import scipy.spatial.distance as sdist
import pandas as pd
import numpy as np
import itertools
import os
from pathlib import Path
from typing import Union

three_to_one = {'ALA':'A', 'ARG':'R', 'ASN':'N', 'ASP':'D', 'CYS':'C',
                'GLU':'E', 'GLN':'Q', 'GLY':'G', 'HIS':'H', 'ILE':'I',
                'LEU':'L', 'LYS':'K', 'MET':'M', 'PHE':'F', 'PRO':'P',
                'SER':'S', 'THR':'T', 'TRP':'W', 'TYR':'Y', 'VAL':'V'}



[docs]
def download(pdbID: str,directory: Union[Path,str]=Path.cwd()) -> Path:
    """
    Downloads a single pdb file

    Parameters
    ----------
    pdbID: str,
        PDB ID
    directory: Path or str,
        Directory where PDB file will be downloaded.

    Returns
    -------
    pdb_file : Path
        PDB file location.
    """

    import urllib.request
    pdb_file=Path(directory) / f'{pdbID}.pdb'
    urllib.request.urlretrieve('http://www.rcsb.org/pdb/files/%s.pdb' % pdbID, pdb_file)
    return pdb_file



[docs]
def get_sequence(pdb_file: str, 
                 chain: str
                 ) -> str:
    """
    Get a protein sequence from a pdb file

    Parameters
    ----------
    pdb_file : str,
        PDB file location.
    chain: str,
        Chain ID of the selected protein.

    Returns
    -------
    sequence : str
        Protein sequence.
    """
    """
    Get a protein sequence from a PDB file
    
    :param pdb: PDB file location
    :param chain: chain name of PDB file to get sequence
    :return: protein sequence
    """

    if ".cif" in str(pdb_file):
        parser = MMCIFParser()
    else:
        parser = PDBParser()
    structure = parser.get_structure('name', pdb_file)
    if chain==None:
        all_chains=[i.get_id() for i in structure.get_chains()]
    else:
        all_chains=[chain]
    sequence = ""
    for chain in all_chains:
        c = structure[0][chain]
        chain_seq = ""
        for residue in c:
            is_regular_res = residue.has_id('CA') and residue.has_id('O')
            res_id = residue.get_id()[0]
            if (res_id==' ' or res_id=='H_MSE' or res_id=='H_M3L' or res_id=='H_CAS') and is_regular_res:
                residue_name = residue.get_resname()
                chain_seq += three_to_one[residue_name]
        sequence += chain_seq
    return sequence





[docs]
def get_distance_matrix(pdb_file: Union[Path,str],
                        chain: str,
                        method: str = 'CB'
                        ) -> np.array:
    """
    Calculate the distance matrix of the specified atoms in a PDB file.

    Parameters
    ----------
    pdb_file: Path or str 
        The path to the PDB file.
    chain: str
        The chainID or chainIDs (space separated) of the protein.
    method: str
        The method to use for calculating the distance matrix. 
        Defaults to 'CB', which uses the CB atom for all residues except GLY, which uses the CA atom. 
        Options:
            'CA' for using only the CA atom,
            'minimum' for using the minimum distance between all atoms in each residue,
            'CB_force' computes a new coordinate for the CB atom based on the CA, C, and N atoms and uses CB distance even for glycine.

    Returns:
        np.array: The distance matrix of the selected atoms.

    Raises:
        IndexError: If the selection of atoms is empty.
        ValueError: If the method is not recognized.
    """

    structure = prody.parsePDB(str(pdb_file))
    chain_selection = '' if chain is None else f' and chain {chain}'

    if method == 'CA':
        coords = structure.select('protein and name CA' + chain_selection).getCoords()
    elif method == 'CB':
        coords = structure.select('(protein and (name CB) or (resname GLY and name CA))' + chain_selection).getCoords()
    elif method == 'minimum':
        selection = structure.select('protein' + chain_selection)
        coords = selection.getCoords()
        distance_matrix = sdist.squareform(sdist.pdist(coords))
        resids = selection.getResindices()
        residues = pd.Series(resids).unique()
        selections = np.array([resids == a for a in residues])
        dm = np.zeros((len(residues), len(residues)))
        for i, j in itertools.combinations(range(len(residues)), 2):
            d = distance_matrix[selections[i]][:, selections[j]].min()
            dm[i, j] = d
            dm[j, i] = d
        return dm
    elif method == 'CB_force':
        sel_CA = structure.select('name CA')
        sel_N = structure.select('name N')
        sel_C = structure.select('name C')
        
        # Base vectors
        vector_CA_C=sel_C.getCoords() - sel_CA.getCoords()
        vector_CA_N=sel_N.getCoords() - sel_CA.getCoords()

        vector_CA_C/=np.linalg.norm(vector_CA_C,axis=1,keepdims=True)
        vector_CA_N/=np.linalg.norm(vector_CA_N,axis=1,keepdims=True)

        #First Ortogonal vector
        cross_CA_C_CA_N=np.cross(vector_CA_C,vector_CA_N)
        cross_CA_C_CA_N/=np.linalg.norm(cross_CA_C_CA_N,axis=1,keepdims=True)

        #Second Ortogonal vector
        cross_cross_CA_N=np.cross(cross_CA_C_CA_N,vector_CA_N)
        cross_cross_CA_N/= np.linalg.norm(cross_cross_CA_N,axis=1,keepdims=True)
        
        #Precomputed CB coordinates
        coords = -0.531020*vector_CA_N-1.206181*cross_CA_C_CA_N+0.789162*cross_cross_CA_N+sel_CA.getCoords()
    else:
        raise ValueError(f"Invalid method '{method}'. Accepted methods are 'CA', 'CB', 'minimum', and 'CB_force'.")

    if len(coords) == 0:
        raise IndexError('Empty selection for distance map')

    distance_matrix = sdist.squareform(sdist.pdist(coords))
    return distance_matrix




[docs]
def full_to_filtered_aligned_mapping(aligned_sequence: str,
                                    filtered_aligned_sequence: str)->dict:

    """
    Get a dictionary mapping residue positions in the full pdb sequence to the aligned pdb sequence

    Parameters
    ----------
    aligned_sequence : str,
        Raw aligned PDB sequence.
    filtered_aligned_sequence: str,
        Filtered aligned PDB sequence (columns with insertions and deletions, i.e. dashes, that are
        typically filtered in MSA file processing are removed)

    Returns
    -------
    full_to_aligned_index_dict : dict
        Dictionary
    """
    full_to_aligned_index_dict={}; counter=0
    for i,x in enumerate(aligned_sequence):
        if x != "-" and x==x.upper():
            full_to_aligned_index_dict[counter]=i
        if x!="-":
            counter+=1

    dash_indices=[i for i,x in enumerate(filtered_aligned_sequence) if x!="-"]
    counter=0
    for entry in full_to_aligned_index_dict:
        full_to_aligned_index_dict[entry]=dash_indices[counter]
        counter+=1

    return full_to_aligned_index_dict