from Bio import AlignIO
import numpy as np
from pathlib import Path
from typing import Union
import logging
logger = logging.getLogger(__name__)
[docs]
def filter_alignment(alignment_file: Union[Path,str],
output_file:str,
alignment_format:str = "stockholm")->Path:
"""
Filter stockholm alignment by removing unaligned sections.
Filters by saving the alignment into memory.
Parameters
----------
alignment_file : Path or str
Alignment file name
output_file : str
Output file name
alignment_format : str
Biopython alignment format (Default: stockholm)
Returns
-------
output_file : Path
location of the output_file
"""
filter_query=False
# Parse the alignment
alignment = AlignIO.read(alignment_file, alignment_format)
# Create a numpy array of the alignment
alignment_array=[]
for record in alignment:
alignment_array+=[np.array(record.seq)]
if 'Query' in record.name:
logging.debug(f'Query sequence found: >{record.name}')
query_seq=np.array(record.seq)
filter_query=True
alignment_array=np.array(alignment_array)
# If there is a query sequence only take sequences in query
if filter_query:
alignment_array=alignment_array[:,query_seq!='-']
# Substitute lower case letters with gaps
for letter in np.unique(alignment_array):
if letter!=letter.upper():
alignment_array[alignment_array==letter]='-'
# Take only columns that are not all gaps
not_all_gaps=(alignment_array=='-').sum(axis=0)<len(alignment_array)
new_alignment=alignment_array[:,not_all_gaps]
output_file = Path(output_file)
logging.debug(f'{"".join(new_alignment[0])}')
# Write filtered alignment to file
text=''
for record,new_seq in zip(alignment,new_alignment):
text+=f">{record.id}\n{''.join(new_seq)}\n"
output_file.write_text(text)
return output_file
[docs]
def filter_alignment_lowmem(alignment_file:str,
output_file:str,
alignment_format:str = "stockholm")->Path:
"""
Filter stockholm alignment by removing unaligned sections.
Filters by reading the file without saving to memory.
Parameters
----------
alignment_file : str
location of file
output_file : str
location of the output_file
alignment_format : str
Biopython alignment format (Default: stockholm)
Returns
-------
output_file : str
location of the output_file
"""
#TODO: Implement filter query for jackhmmer
# Remove inserts and columns that are completely composed of gaps from MSA
alignment = AlignIO.parse(alignment_file, "stockholm")
output_handle = open(output_file, "w")
with open(alignment_file) as alignment_handle:
alignment = AlignIO.read(alignment_handle, "stockholm")
index_mask = []
for i, record in enumerate(alignment):
index_mask += [i for i, x in enumerate(list(record.seq)) if x != x.upper()]
for i in range(len(alignment[0].seq)):
if alignment[:, i] == ''.join(["-"] * len(alignment)):
index_mask.append(i)
index_mask = sorted(list(set(index_mask)))
with open(alignment_file) as alignment_handle, open(output_file, "w") as output_handle:
alignment = AlignIO.read(alignment_handle, "stockholm")
for i, record in enumerate(alignment):
aligned_sequence = [list(record.seq)[i] for i in range(len(list(record.seq))) if i not in index_mask]
output_handle.write(">%s\n" % record.id + "".join(aligned_sequence) + '\n')
return Path(output_file)