Source code for frustratometer.classes.Structure

from .. import pdb
import prody
import os
import Bio.PDB.Polypeptide as poly
import numpy as np
from typing import Union
from pathlib import Path
import warnings

__all__ = ['Structure']

residue_names=[]

[docs] class Structure: def __init__(self, pdb_file: Union[Path,str], chain: Union[str,None]=None, seq_selection: str = None, aligned_sequence: str = None, filtered_aligned_sequence: str = None, distance_matrix_method:str = 'CB', pdb_directory: Path = Path.cwd(), repair_pdb:bool = True)->object: """ Generates structure object. Both PDB and CIF format files are accepted as input. Parameters ---------- pdb_file : str PDB file path chain : str PDB chain name. If "chain=None", all chains will be included. seq_selection: str Subsequence selection command, using Prody select module. *If wanting to use original PDB indexing, set seq_selection as: "resnum `{initial_index}to{final_index}`" *If wanting to use absolute PDB indexing (first residue has index=0), set seq_selection as: "resindex `{initial_index}to{final_index}`" Note that using "to" will create a substructure including both the initial and final designated residue. If you would like to not include the final desginated residue, replace "to" with ":" Note that the algorithm will account for any gaps in the PDB and adjust the provided residue range accordingly. distance_matrix_method: str The method to use for calculating the distance matrix. Defaults to 'CB', which uses the CB atom for all residues except GLY, which uses the CA atom. Other options are 'CA' for using only the CA atom, and 'minimum' for using the minimum distance between all atoms in each residue. pdb_directory: str Directory where repaired pdb will be downloaded repair_pdb: bool If True, provided pdb file will be repaired with missing residues inserted and heteroatoms removed. Note that a pdb file will be produced, regardless of input file format. Returns ------- Structure object """ try: #Check if file exists pdb_file=Path(pdb_file) assert pdb_file.exists() except AssertionError: #Attempt to download pdb file pdb_file=str(pdb_file) if len(pdb_file)==4: self.pdbID=pdb_file print(f"Downloading {self.pdbID} from the PDB") pdb_file=pdb.download(self.pdbID, pdb_directory) else: raise FileNotFoundError(f"Provided file {pdb_file} does not exist") self.pdbID=pdb_file.stem self.pdb_file=pdb_file self.chain=chain self.distance_matrix_method=distance_matrix_method self.filtered_aligned_sequence=filtered_aligned_sequence self.aligned_sequence=aligned_sequence self.seq_selection=seq_selection if self.seq_selection==None: self.init_index_shift=0 if repair_pdb: fixer=pdb.repair_pdb(pdb_file, chain, pdb_directory) self.pdb_file=str(pdb_directory/f"{self.pdbID}_cleaned.pdb") if ".pdb" in str(pdb_file) or repair_pdb==True: self.structure = prody.parsePDB(str(self.pdb_file), chain=self.chain).select(f"protein") else: self.structure=prody.parseMMCIF(str(self.pdb_file),chain=self.chain).select(f"protein") else: assert len(self.seq_selection.replace("to"," to ").replace(":"," : ").split())>=4, "Please correctly input your residue selection" if self.chain==None: raise ValueError("Please provide a chain name") self.init_index=int(self.seq_selection.replace("to"," to ").replace(":"," : ").split()[1].replace("`","")) self.fin_index=int(self.seq_selection.replace("to"," to ").replace(":"," : ").split()[3].replace("`","")) #Account for pdbs that have starting indices greater than one and find any gaps in the pdb. gap_indices=[]; atom_line_count=0 if ".cif" in str(pdb_file): extension="cif" shift=2;index_shift=3 else: extension="pdb" shift=0;index_shift=0 with open(pdb_file,"r") as f: for line in f: if line.split()[0]=="ATOM" and line.split()[4+shift]==self.chain: try: res_index=''.join(i for i in line.split()[5+index_shift] if i.isdigit()) next_res_index=''.join(i for i in next(f).split()[5+index_shift] if i.isdigit()) if int(next_res_index)-int(res_index)>1: gap_indices.extend(list(range(int(res_index)+1,int(next_res_index)))) if atom_line_count==0 and poly.is_aa(line.split()[3+shift]): self.pdb_init_index=int(line.split()[5+index_shift]) atom_line_count+=1 except: continue if "resnum" in self.seq_selection: assert self.init_index>=self.pdb_init_index, "Please pick an initial index within the pdb's original indices" self.init_index_shift=self.init_index-self.pdb_init_index self.fin_index_shift=self.fin_index-self.pdb_init_index+1 if repair_pdb: fixer=pdb.repair_pdb(pdb_file, chain, pdb_directory) self.pdb_file=f"{pdb_directory}/{self.pdbID}_cleaned.pdb" self.select_gap_indices=[i for i in gap_indices if self.init_index<=i<=self.fin_index] self.fin_index_shift-=len(self.select_gap_indices) self.seq_selection=f"resnum `{self.init_index_shift+1}to{self.fin_index_shift}`" elif "resindex" in self.seq_selection: self.init_index_shift=self.init_index self.fin_index_shift=self.fin_index+1 if repair_pdb: fixer=pdb.repair_pdb(pdb_file, chain, pdb_directory) self.pdb_file=f"{pdb_directory}/{self.pdbID}_cleaned.pdb" self.chain="A" if ".pdb" in str(pdb_file) or repair_pdb==True: self.structure = prody.parsePDB(str(self.pdb_file), chain=self.chain).select(f"protein and {self.seq_selection}") else: self.structure=prody.parseMMCIF(str(self.pdb_file),chain=self.chain).select(f"protein and {self.seq_selection}") self.sequence=pdb.get_sequence(self.pdb_file,self.chain) self.distance_matrix=pdb.get_distance_matrix(pdb_file=self.pdb_file,chain=self.chain, method=self.distance_matrix_method) self.full_pdb_distance_matrix=self.distance_matrix self.z_coordinates=self.structure.select('((name CB) or (resname GLY and name CA))').getCoords() if self.seq_selection!=None: self.distance_matrix=self.distance_matrix[self.init_index_shift:self.fin_index_shift, self.init_index_shift:self.fin_index_shift] self.sequence=self.sequence[self.init_index_shift:self.fin_index_shift] if self.aligned_sequence is not None: self.full_to_aligned_index_dict=pdb.full_to_filtered_aligned_mapping(self.aligned_sequence,self.filtered_aligned_sequence) self.mapped_distance_matrix=np.full((len(self.filtered_aligned_sequence), len(self.filtered_aligned_sequence)), np.inf) pos1, pos2 = np.meshgrid(list(self.full_to_aligned_index_dict.keys()), list(self.full_to_aligned_index_dict.keys()), indexing='ij', sparse=True) modpos1, modpos2 = np.meshgrid(list(self.full_to_aligned_index_dict.values()), list(self.full_to_aligned_index_dict.values()), indexing='ij', sparse=True) self.mapped_distance_matrix[modpos1,modpos2]=self.distance_matrix[pos1,pos2] np.fill_diagonal(self.mapped_distance_matrix, 0) else: if self.seq_selection==None: self.full_to_aligned_index_dict=dict(zip(range(len(self.sequence)), range(len(self.sequence)))) self.mapped_distance_matrix=self.distance_matrix else: self.full_to_aligned_index_dict=dict(zip(range(self.init_index_shift,self.fin_index_shift+1), range(len(self.sequence)))) self.mapped_distance_matrix=self.distance_matrix
[docs] @classmethod def full_pdb(cls,pdb_file: Union[Path,str], chain: Union[str,None]=None, aligned_sequence: str = None, filtered_aligned_sequence: str = None, distance_matrix_method:str = 'CB', pdb_directory: Path = Path.cwd(), repair_pdb:bool = True): warnings.warn("The class method 'full_pdb' is now depreciated. You can now simply call the Structure class to create a full pdb or spliced pdb object.") return cls(pdb_file=pdb_file, chain=chain, aligned_sequence=aligned_sequence, filtered_aligned_sequence=filtered_aligned_sequence, distance_matrix_method=distance_matrix_method, pdb_directory=pdb_directory, repair_pdb=repair_pdb)
[docs] @classmethod def spliced_pdb(cls,pdb_file: Union[Path,str], chain: Union[str,None]=None, seq_selection: str = None, aligned_sequence: str = None, filtered_aligned_sequence: str = None, distance_matrix_method:str = 'CB', pdb_directory: Path = Path.cwd(), repair_pdb:bool = True): warnings.warn("The class method 'spliced_pdb' is now depreciated. You can now simply call the Structure class to create a full pdb or spliced pdb object.") return cls(pdb_file=pdb_file, chain=chain, seq_selection=seq_selection, aligned_sequence=aligned_sequence, filtered_aligned_sequence=filtered_aligned_sequence, distance_matrix_method=distance_matrix_method, pdb_directory=pdb_directory, repair_pdb=repair_pdb,)
# @property # def sequence(self): # return self._sequence # # Set a new sequence in case someone needs to calculate the energy of a diferent sequence with the same structure # @sequence.setter # def sequence(self, value :str): # assert len(value) == len(self._sequence) # self._sequence = value # @property # def pdb_file(self): # return str(self._pdb_file) # @pdb_file.setter # def pdb_file(self,value: str): # self._pdb_file=value # @property # def chain(self): # return self._chain # @chain.setter # def chain(self,value): # self._chain=value # @property # def distance_matrix_method(self): # return self._distance_matrix_method # @distance_matrix_method.setter # def distance_matrix_method(self,value): # self._distance_matrix_method = value # @property # def init_index(self): # return self._init_index # @init_index.setter # def init_index(self,value): # self._init_index = value # @property # def fin_index(self): # return self._fin_index # @fin_index.setter # def fin_index(self,value): # self._fin_index = value