Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,8 @@
**/*.swp
**/*.pyc
**/.vscode

.eggs/
build/
dist/
pyali.egg-info/
16 changes: 14 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,14 @@
# pyali
A library for manipulating multiple sequence alignments (bioinformatics, structural biology)
# pyali
A library for manipulating multiple sequence alignments (bioinformatics, structural biology)

# Quickstart
We recommend you install `pyali` into its own virtual environment.

For example, if you use `pyenv`, you would execute the following commands:
* `pyenv virtualenv 3.8.2 pyali-3.8.2`
* `pyenv activate pyali-3.8.2`
* `python setup.py install` OR `pip install .`

You may also install `pyali` into an existing virtual environment for another project:
* `pyenv activate YOUR-PROJECT`
* `python setup.py install` OR `pip install .`
4 changes: 4 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[pytest]
markers =
mrgali: tests for code written by Chris Tang
scripts: tests for code written by Antoniya Aleksandrova
15 changes: 15 additions & 0 deletions scripts/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Working with FASTA files
The scripts here use **mrgali** (pyali) but can pre- and post-process files to make it easier to work directly with fasta files.

`python raise_aln.py -in <fasta file> -out <output file> -seq <sequence name>`

will move the alignment specified by the -seq tag to the top of a fasta file.


`python align_merger.py -in <list with fasta files> -out <output file> -width <width of lines in output fasta> -ref <name of reference sequence>`

takes a list of alignment fasta files that share one common sequence (-ref) and merges them into a single multiple sequence alignment. See *tests/examples* for an example.

Tests can be used by running `pytest -v`.

If you're using the scripts separately from the repo, make sure that you've installed *pyali* and that you've changed the dependency, as specified, in *align_merger.py*
127 changes: 127 additions & 0 deletions scripts/align_merger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
# Author: Antoniya A. Aleksandrova
# Language: Python 3
# Description: Takes pairwise alignments and outputs a multiple alignment
# Usage: python align_merger.py -in <list with fasta files> -out <output file> -width <width of lines in output fasta> -ref <name of reference sequence>

import argparse
import glob
import os
import sys

try:
from pyali.mrgali import Alignment
except:
print("[ERROR]: pyali has not been installed. To install, run `python setup.py install` from project directory.")
from raise_aln import raise_seq


def align_merger(file_list, outname, width, reference_seq):
refs, alis, alis_names = [], [], []
for f in file_list:
if reference_seq != '':
raise_seq(f, f + '.tmp', reference_seq)
alignment = open(f + '.tmp', 'r')
else:
alignment = open(f, 'r')
flag = 0
sequence1 = ""
sequence2 = ""
alis_element = []
for line in alignment:
if line.startswith(">") and flag == 2:
alis_element.append(sequence2)
alis_names.append(name2)
sequence2 = ""
name2 = line.strip()
if line.startswith(">") and flag == 1:
flag = 2
alis_element.append(sequence1)
name2 = line.strip()
if line.startswith(">") and flag == 0:
flag = 1
name1 = line.strip()
if not line.startswith(">") and flag == 1:
sequence1 = sequence1 + line.strip().upper()
if not line.startswith(">") and flag == 2:
sequence2 = sequence2 + line.strip().upper()
alignment.close()
if reference_seq != '':
os.remove(f + '.tmp')
alis_element.append(sequence2)
alis.append(alis_element)
alis_names.append(name2)

refs = [''.join([s for s in seqs[0] if s != '-']) for seqs in alis]
if refs.count(refs[0]) != len(refs):
print("The reference sequences in all the provided alignments are not identical.")
for i, r in enumerate(refs[1:]):
for j, s in enumerate(refs[0]):
if s!=refs[i+1][j]:
print(file_list[0] +": (" + str(j) + "," + s + "), " + file_list[i+1] + ": " + r[j])
raise SystemExit("References need to be the same to proceed.")

a = Alignment.from_reference(refs)
for i in range(len(alis)):
a.merge(i, alis[i])

flds = str(a).split('\n')

aligned_list = []
out = open(outname, 'w')
for i, ln in enumerate(flds):
if i == 0:
s = ln[ln.index(':') + 2:]
out.write(name1 + '\n')
aligned_list.append((name1, s))
while len(s) > 0:
out.write(s[:width] + '\n')
s = s[width:]
if i >= len(refs):
s = ln[ln.index(':') + 2:]
out.write(alis_names[i - len(refs)] + '\n')
aligned_list.append((alis_names[i - len(refs)], s))
while len(s) > 0:
out.write(s[:width] + '\n')
s = s[width:]
out.close()
return aligned_list


if __name__ == "__main__":

# Remove previously generated merged alignments
if os.path.isfile('merged_alignments.fasta'):
os.remove('merged_alignments.fasta')

# Parse user input and set defaults
parser = argparse.ArgumentParser()

parser.add_argument('-in', '--list_of_fasta_files', nargs='?') # file with a list of fasta files (one per line)
parser.add_argument('-out', '--out', nargs='?') # output file
parser.add_argument('-width', '--width', nargs='?') # fasta line width
parser.add_argument('-ref', '--reference', nargs='?') # name of the reference sequence

parser.set_defaults(list_of_fasta_files=glob.glob('*.fasta'))
parser.set_defaults(out='merged_alignments.fasta')
parser.set_defaults(width='72')
parser.set_defaults(reference='')

parsed = parser.parse_args()
if type(parsed.__dict__['list_of_fasta_files']) == list:
file_list = parsed.__dict__['list_of_fasta_files']
print("[INFO]: Pairwise alignments will be taken from all fasta files in current directory.")
else:
with open(parsed.__dict__['list_of_fasta_files'], 'r') as f:
file_list = f.read().strip().split('\n')
print("[INFO]: Pairwise alignment files read from " + parsed.__dict__['list_of_fasta_files'] + ".")
print("[INFO]: The following files were read: " + ', '.join(file_list))
if len(file_list) == 0:
raise SystemExit("No fasta files available for alignment.")
outname = parsed.__dict__['out']
print("[INFO]: Merged alignment will be written to " + outname + ".")
width = int(parsed.__dict__['width'])
ref_seq = parsed.__dict__['reference']

align_merger(file_list, outname, width, ref_seq)

print("[INFO]: Done.")
60 changes: 60 additions & 0 deletions scripts/raise_aln.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Author: Antoniya A. Aleksandrova
# Language: Python 3
# Description: Move a specified alignment to the top of a fasta file
# Usage: python raise_aln.py -in <fasta file> -out <output file> -seq <sequence name>

import os
import sys
import argparse

def raise_seq(infile, outfile, seqn):
aligns = []
name = ""
seq = ""
if '>' not in seqn:
seqn = '>' + seqn
f = open(infile, 'r')
for line in f:
if line.startswith(">"):
if seq != "":
aligns.append((name, seq))
name = line.strip()
seq = ""
elif not line.startswith("#"):
seq = seq + line
aligns.append((name, seq))
f.close()

index = [x for x, y in enumerate(aligns) if y[0] == seqn] # locate the top sequence
if len(index)==0:
raise SystemExit(seqn + " cannot be located in " + infile)
else:
index = index[0]
out = open(outfile, 'w')
out.write(aligns[index][0] + '\n' + aligns[index][1].strip())
for a in aligns:
if a[0]!=seqn:
out.write('\n' + a[0] + '\n' + a[1].strip())
out.close()



if __name__ == "__main__":
if len(sys.argv)<3:
raise SystemExit("Usage: python raise_fasta.py -in <fasta file> -seq <sequence name>\n\tOptional inputs: -out <output file>")

parser = argparse.ArgumentParser()

parser.add_argument('-in', '--fasta_file', nargs='?')
parser.add_argument('-out', '--out', nargs='?') # output file
parser.add_argument('-seq', '--seqname', nargs='?') # name of the sequence that should be moved to the top

parser.set_defaults(out = '_raised.fasta')
parsed = parser.parse_args()
infile = parsed.__dict__['fasta_file']
filename, file_extension = os.path.splitext(infile)
outfile = parsed.__dict__['out']
if outfile == '_raised.fasta':
outfile = filename + '_raised' + file_extension
seqnanme = '>' + parsed.__dict__['seqname']
raise_seq(infile, outfile, seqname)
8 changes: 8 additions & 0 deletions scripts/tests/examples/a1.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
>template
DERE-T
>t1
CC-CCC
>t2
D-DDDD
>t3
EEE-EE
4 changes: 4 additions & 0 deletions scripts/tests/examples/a2.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>t4
FFF-FFF
>template
--DERET
70 changes: 70 additions & 0 deletions scripts/tests/test_alis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import pytest
import sys
import os
from pathlib import Path

sys.path.append(str(Path(__file__).resolve().parents[1]))

testsdir = str(Path(__file__).resolve().parents[1]) + '/tests/'

@pytest.mark.mrgali
@pytest.mark.parametrize('refs,alis,msa', [(['IGE','IGE','IGE'],
[
[
'I-G-E',
'IRGIS',
'TT---'
], [
'IGE',
'TTT'
], [
'I-G-E',
'F-F--'
]
],
['0: I-G-E', '1: I-G-E', '2: I-G-E',
'3: IRGIS', '4: TT---', '5: T-T-T', '6: F-F--']
)])
def test_pyali_Alignmnet(refs, alis, msa):
from pyali.mrgali import Alignment
a = Alignment.from_reference(refs)
a.merge(0, alis[0])
a.merge(1, alis[1])
a.merge(2, alis[2])
assert str(a).split('\n') == msa


@pytest.mark.scripts
@pytest.mark.parametrize('infile,outfile,refn', [(testsdir + 'examples/a2.fa', testsdir + 'examples/a2_tmp.fa','template')])
def test_raise_seq(infile, outfile, refn):
from raise_aln import raise_seq
assert os.path.isfile(infile) == True, '%s does not exist.'%(infile)
raise_seq(infile, outfile, refn)
assert os.path.isfile(outfile), '%s does not exist.'%(outfile)
with open(outfile) as f:
content = f.read().splitlines()
assert content[0] == '>' + refn, 'The sequence of %s was not raised to the top of the alignments.' % (refn)
os.remove(outfile)

@pytest.mark.scripts
@pytest.mark.parametrize('file_list,outname,width,refn,result', [([testsdir + 'examples/a1.fa', testsdir + 'examples/a2.fa'],
testsdir + 'examples/a1_a2_merged.fa', 80, 'template',
['>template',
'--DERE-T',
'>t1',
'--CC-CCC',
'>t2',
'--D-DDDD',
'>t3',
'--EEE-EE',
'>t4',
'FFF-FF-F']
)])
def test_align_merger(file_list, outname, width, refn, result):
from align_merger import align_merger
align_merger(file_list, outname, width, refn)
assert os.path.isfile(outname), 'No output was produced'
with open(outname) as f:
content = f.read().splitlines()
assert content == result

3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from distutils.core import setup
from setuptools import setup
setup(
name = 'pyali',
packages = ['pyali'], # this must be the same as the name above
python_requires = '>=3',
install_requires = ['pandas', 'numpy', 'simplejson'],
version = '0.1.1',
description = 'A package for merging alignments',
Expand Down