diff --git a/.gitignore b/.gitignore index a61b590..d893d25 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,8 @@ **/*.swp **/*.pyc **/.vscode + +.eggs/ +build/ +dist/ +pyali.egg-info/ diff --git a/README.md b/README.md index 9a60a74..e573be4 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,14 @@ -# pyali -A library for manipulating multiple sequence alignments (bioinformatics, structural biology) +# pyali +A library for manipulating multiple sequence alignments (bioinformatics, structural biology) + +# Quickstart +We recommend you install `pyali` into its own virtual environment. + +For example, if you use `pyenv`, you would execute the following commands: +* `pyenv virtualenv 3.8.2 pyali-3.8.2` +* `pyenv activate pyali-3.8.2` +* `python setup.py install` OR `pip install .` + +You may also install `pyali` into an existing virtual environment for another project: +* `pyenv activate YOUR-PROJECT` +* `python setup.py install` OR `pip install .` diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..c2b7416 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +markers = + mrgali: tests for code written by Chris Tang + scripts: tests for code written by Antoniya Aleksandrova diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000..7077d54 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,15 @@ +# Working with FASTA files +The scripts here use **mrgali** (pyali) but can pre- and post-process files to make it easier to work directly with fasta files. + +`python raise_aln.py -in -out -seq ` + +will move the alignment specified by the -seq tag to the top of a fasta file. + + +`python align_merger.py -in -out -width -ref ` + +takes a list of alignment fasta files that share one common sequence (-ref) and merges them into a single multiple sequence alignment. See *tests/examples* for an example. + +Tests can be used by running `pytest -v`. + +If you're using the scripts separately from the repo, make sure that you've installed *pyali* and that you've changed the dependency, as specified, in *align_merger.py* diff --git a/scripts/align_merger.py b/scripts/align_merger.py new file mode 100644 index 0000000..a56dbf4 --- /dev/null +++ b/scripts/align_merger.py @@ -0,0 +1,127 @@ +# Author: Antoniya A. Aleksandrova +# Language: Python 3 +# Description: Takes pairwise alignments and outputs a multiple alignment +# Usage: python align_merger.py -in -out -width -ref + +import argparse +import glob +import os +import sys + +try: + from pyali.mrgali import Alignment +except: + print("[ERROR]: pyali has not been installed. To install, run `python setup.py install` from project directory.") +from raise_aln import raise_seq + + +def align_merger(file_list, outname, width, reference_seq): + refs, alis, alis_names = [], [], [] + for f in file_list: + if reference_seq != '': + raise_seq(f, f + '.tmp', reference_seq) + alignment = open(f + '.tmp', 'r') + else: + alignment = open(f, 'r') + flag = 0 + sequence1 = "" + sequence2 = "" + alis_element = [] + for line in alignment: + if line.startswith(">") and flag == 2: + alis_element.append(sequence2) + alis_names.append(name2) + sequence2 = "" + name2 = line.strip() + if line.startswith(">") and flag == 1: + flag = 2 + alis_element.append(sequence1) + name2 = line.strip() + if line.startswith(">") and flag == 0: + flag = 1 + name1 = line.strip() + if not line.startswith(">") and flag == 1: + sequence1 = sequence1 + line.strip().upper() + if not line.startswith(">") and flag == 2: + sequence2 = sequence2 + line.strip().upper() + alignment.close() + if reference_seq != '': + os.remove(f + '.tmp') + alis_element.append(sequence2) + alis.append(alis_element) + alis_names.append(name2) + + refs = [''.join([s for s in seqs[0] if s != '-']) for seqs in alis] + if refs.count(refs[0]) != len(refs): + print("The reference sequences in all the provided alignments are not identical.") + for i, r in enumerate(refs[1:]): + for j, s in enumerate(refs[0]): + if s!=refs[i+1][j]: + print(file_list[0] +": (" + str(j) + "," + s + "), " + file_list[i+1] + ": " + r[j]) + raise SystemExit("References need to be the same to proceed.") + + a = Alignment.from_reference(refs) + for i in range(len(alis)): + a.merge(i, alis[i]) + + flds = str(a).split('\n') + + aligned_list = [] + out = open(outname, 'w') + for i, ln in enumerate(flds): + if i == 0: + s = ln[ln.index(':') + 2:] + out.write(name1 + '\n') + aligned_list.append((name1, s)) + while len(s) > 0: + out.write(s[:width] + '\n') + s = s[width:] + if i >= len(refs): + s = ln[ln.index(':') + 2:] + out.write(alis_names[i - len(refs)] + '\n') + aligned_list.append((alis_names[i - len(refs)], s)) + while len(s) > 0: + out.write(s[:width] + '\n') + s = s[width:] + out.close() + return aligned_list + + +if __name__ == "__main__": + + # Remove previously generated merged alignments + if os.path.isfile('merged_alignments.fasta'): + os.remove('merged_alignments.fasta') + + # Parse user input and set defaults + parser = argparse.ArgumentParser() + + parser.add_argument('-in', '--list_of_fasta_files', nargs='?') # file with a list of fasta files (one per line) + parser.add_argument('-out', '--out', nargs='?') # output file + parser.add_argument('-width', '--width', nargs='?') # fasta line width + parser.add_argument('-ref', '--reference', nargs='?') # name of the reference sequence + + parser.set_defaults(list_of_fasta_files=glob.glob('*.fasta')) + parser.set_defaults(out='merged_alignments.fasta') + parser.set_defaults(width='72') + parser.set_defaults(reference='') + + parsed = parser.parse_args() + if type(parsed.__dict__['list_of_fasta_files']) == list: + file_list = parsed.__dict__['list_of_fasta_files'] + print("[INFO]: Pairwise alignments will be taken from all fasta files in current directory.") + else: + with open(parsed.__dict__['list_of_fasta_files'], 'r') as f: + file_list = f.read().strip().split('\n') + print("[INFO]: Pairwise alignment files read from " + parsed.__dict__['list_of_fasta_files'] + ".") + print("[INFO]: The following files were read: " + ', '.join(file_list)) + if len(file_list) == 0: + raise SystemExit("No fasta files available for alignment.") + outname = parsed.__dict__['out'] + print("[INFO]: Merged alignment will be written to " + outname + ".") + width = int(parsed.__dict__['width']) + ref_seq = parsed.__dict__['reference'] + + align_merger(file_list, outname, width, ref_seq) + + print("[INFO]: Done.") diff --git a/scripts/raise_aln.py b/scripts/raise_aln.py new file mode 100644 index 0000000..29cd246 --- /dev/null +++ b/scripts/raise_aln.py @@ -0,0 +1,60 @@ +# Author: Antoniya A. Aleksandrova +# Language: Python 3 +# Description: Move a specified alignment to the top of a fasta file +# Usage: python raise_aln.py -in -out -seq + +import os +import sys +import argparse + +def raise_seq(infile, outfile, seqn): + aligns = [] + name = "" + seq = "" + if '>' not in seqn: + seqn = '>' + seqn + f = open(infile, 'r') + for line in f: + if line.startswith(">"): + if seq != "": + aligns.append((name, seq)) + name = line.strip() + seq = "" + elif not line.startswith("#"): + seq = seq + line + aligns.append((name, seq)) + f.close() + + index = [x for x, y in enumerate(aligns) if y[0] == seqn] # locate the top sequence + if len(index)==0: + raise SystemExit(seqn + " cannot be located in " + infile) + else: + index = index[0] + out = open(outfile, 'w') + out.write(aligns[index][0] + '\n' + aligns[index][1].strip()) + for a in aligns: + if a[0]!=seqn: + out.write('\n' + a[0] + '\n' + a[1].strip()) + out.close() + + + +if __name__ == "__main__": + if len(sys.argv)<3: + raise SystemExit("Usage: python raise_fasta.py -in -seq \n\tOptional inputs: -out ") + + parser = argparse.ArgumentParser() + + parser.add_argument('-in', '--fasta_file', nargs='?') + parser.add_argument('-out', '--out', nargs='?') # output file + parser.add_argument('-seq', '--seqname', nargs='?') # name of the sequence that should be moved to the top + + parser.set_defaults(out = '_raised.fasta') + parsed = parser.parse_args() + infile = parsed.__dict__['fasta_file'] + filename, file_extension = os.path.splitext(infile) + outfile = parsed.__dict__['out'] + if outfile == '_raised.fasta': + outfile = filename + '_raised' + file_extension + seqnanme = '>' + parsed.__dict__['seqname'] + raise_seq(infile, outfile, seqname) diff --git a/scripts/tests/examples/a1.fa b/scripts/tests/examples/a1.fa new file mode 100644 index 0000000..784d572 --- /dev/null +++ b/scripts/tests/examples/a1.fa @@ -0,0 +1,8 @@ +>template +DERE-T +>t1 +CC-CCC +>t2 +D-DDDD +>t3 +EEE-EE diff --git a/scripts/tests/examples/a2.fa b/scripts/tests/examples/a2.fa new file mode 100644 index 0000000..d5b86a2 --- /dev/null +++ b/scripts/tests/examples/a2.fa @@ -0,0 +1,4 @@ +>t4 +FFF-FFF +>template +--DERET diff --git a/scripts/tests/test_alis.py b/scripts/tests/test_alis.py new file mode 100644 index 0000000..c21a72b --- /dev/null +++ b/scripts/tests/test_alis.py @@ -0,0 +1,70 @@ +import pytest +import sys +import os +from pathlib import Path + +sys.path.append(str(Path(__file__).resolve().parents[1])) + +testsdir = str(Path(__file__).resolve().parents[1]) + '/tests/' + +@pytest.mark.mrgali +@pytest.mark.parametrize('refs,alis,msa', [(['IGE','IGE','IGE'], + [ + [ + 'I-G-E', + 'IRGIS', + 'TT---' + ], [ + 'IGE', + 'TTT' + ], [ + 'I-G-E', + 'F-F--' + ] + ], + ['0: I-G-E', '1: I-G-E', '2: I-G-E', + '3: IRGIS', '4: TT---', '5: T-T-T', '6: F-F--'] + )]) +def test_pyali_Alignmnet(refs, alis, msa): + from pyali.mrgali import Alignment + a = Alignment.from_reference(refs) + a.merge(0, alis[0]) + a.merge(1, alis[1]) + a.merge(2, alis[2]) + assert str(a).split('\n') == msa + + +@pytest.mark.scripts +@pytest.mark.parametrize('infile,outfile,refn', [(testsdir + 'examples/a2.fa', testsdir + 'examples/a2_tmp.fa','template')]) +def test_raise_seq(infile, outfile, refn): + from raise_aln import raise_seq + assert os.path.isfile(infile) == True, '%s does not exist.'%(infile) + raise_seq(infile, outfile, refn) + assert os.path.isfile(outfile), '%s does not exist.'%(outfile) + with open(outfile) as f: + content = f.read().splitlines() + assert content[0] == '>' + refn, 'The sequence of %s was not raised to the top of the alignments.' % (refn) + os.remove(outfile) + +@pytest.mark.scripts +@pytest.mark.parametrize('file_list,outname,width,refn,result', [([testsdir + 'examples/a1.fa', testsdir + 'examples/a2.fa'], + testsdir + 'examples/a1_a2_merged.fa', 80, 'template', + ['>template', + '--DERE-T', + '>t1', + '--CC-CCC', + '>t2', + '--D-DDDD', + '>t3', + '--EEE-EE', + '>t4', + 'FFF-FF-F'] + )]) +def test_align_merger(file_list, outname, width, refn, result): + from align_merger import align_merger + align_merger(file_list, outname, width, refn) + assert os.path.isfile(outname), 'No output was produced' + with open(outname) as f: + content = f.read().splitlines() + assert content == result + diff --git a/setup.py b/setup.py index cd80236..1fdc3c6 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,8 @@ -from distutils.core import setup +from setuptools import setup setup( name = 'pyali', packages = ['pyali'], # this must be the same as the name above + python_requires = '>=3', install_requires = ['pandas', 'numpy', 'simplejson'], version = '0.1.1', description = 'A package for merging alignments',