From 9179d64958071777b432a89e2b54d7c715dbc8d8 Mon Sep 17 00:00:00 2001 From: Aleksandrova Date: Mon, 2 Mar 2020 17:54:26 -0500 Subject: [PATCH 1/9] added scripts --- scripts/align_merger.py | 126 ++++++++++++++++++++++++++++++++++++++++ scripts/raise_aln.py | 60 +++++++++++++++++++ scripts/tests/test.py | 27 +++++++++ 3 files changed, 213 insertions(+) create mode 100644 scripts/align_merger.py create mode 100644 scripts/raise_aln.py create mode 100644 scripts/tests/test.py diff --git a/scripts/align_merger.py b/scripts/align_merger.py new file mode 100644 index 0000000..1cfc540 --- /dev/null +++ b/scripts/align_merger.py @@ -0,0 +1,126 @@ +# Author: Antoniya A. Aleksandrova +# Language: Python 3.5/2.7 +# Description: Takes pairwise alignments and outputs a multiple alignment +# Usage: python align_merger.py -in -out -width -ref + +import os +import numpy as np +import pandas as pd +import simplejson +import glob +from pyali.mrgali import * +import sys +import argparse +from raise_aln import raise_seq + + +def align_merger(file_list, outname, width, reference_seq): + refs, alis, alis_names = [], [], [] + for f in file_list: + if reference_seq != '': + raise_seq(f, f + '.tmp', reference_seq) + alignment = open(f + '.tmp', 'r') + else: + alignment = open(f, 'r') + flag = 0 + sequence1 = "" + sequence2 = "" + alis_element = [] + for line in alignment: + if line.startswith(">") and flag == 2: + alis_element.append(sequence2) + alis_names.append(name2) + sequence2 = "" + name2 = line.strip() + if line.startswith(">") and flag == 1: + flag = 2 + alis_element.append(sequence1) + name2 = line.strip() + if line.startswith(">") and flag == 0: + flag = 1 + name1 = line.strip() + if not line.startswith(">") and flag == 1: + sequence1 = sequence1 + line.strip().upper() + if not line.startswith(">") and flag == 2: + sequence2 = sequence2 + line.strip().upper() + alignment.close() + if reference_seq != '': + os.remove(f + '.tmp') + alis_element.append(sequence2) + 
alis.append(alis_element) + alis_names.append(name2) + + refs = [''.join([s for s in seqs[0] if s != '-']) for seqs in alis] + if refs.count(refs[0]) != len(refs): + print("The reference sequences in all the provided alignments are not identical.") + for i, r in enumerate(refs[1:]): + for j, s in enumerate(refs[0]): + if j >= len(r) or s != r[j]: # r may be shorter than refs[0]; report it instead of raising IndexError + print(file_list[0] +": (" + str(j) + "," + s + "), " + file_list[i+1] + ": " + (r[j] if j < len(r) else "(none)")) + raise SystemExit("References need to be the same to proceed.") + + a = Alignment.from_reference(refs) + for i in range(len(alis)): + a.merge(i, alis[i]) + + flds = str(a).split('\n') + + aligned_list = [] + out = open(outname, 'w') + for i, ln in enumerate(flds): + if i == 0: + s = ln[ln.index(':') + 2:] + out.write(name1 + '\n') + aligned_list.append((name1, s)) + while len(s) > 0: + out.write(s[:width] + '\n') + s = s[width:] + if i >= len(refs): + s = ln[ln.index(':') + 2:] + out.write(alis_names[i - len(refs)] + '\n') + aligned_list.append((alis_names[i - len(refs)], s)) + while len(s) > 0: + out.write(s[:width] + '\n') + s = s[width:] + out.close() + return aligned_list + + +if __name__ == "__main__": + + # Remove previously generated merged alignments + if os.path.isfile('merged_alignments.fasta'): + os.remove('merged_alignments.fasta') + + # Parse user input and set defaults + parser = argparse.ArgumentParser() + + parser.add_argument('-in', '--list_of_fasta_files', nargs='?') # file with a list of fasta files (one per line) + parser.add_argument('-out', '--out', nargs='?') # output file + parser.add_argument('-width', '--width', nargs='?') # fasta line width + parser.add_argument('-ref', '--reference', nargs='?') # name of the reference sequence + + parser.set_defaults(list_of_fasta_files=glob.glob('*.fasta')) + parser.set_defaults(out='merged_alignments.fasta') + parser.set_defaults(width='72') + parser.set_defaults(reference='') + + parsed = parser.parse_args() + if isinstance(parsed.__dict__['list_of_fasta_files'], list): + file_list = 
parsed.__dict__['list_of_fasta_files'] + print("[INFO]: Pairwise alignments will be taken from all fasta files in current directory.") + else: + with open(parsed.__dict__['list_of_fasta_files'], 'r') as f: + file_list = [ln.strip() for ln in f if ln.strip()] # skip blank lines so an empty list file is caught by the check below + print("[INFO]: Pairwise alignment files read from " + parsed.__dict__['list_of_fasta_files'] + ".") + print("[INFO]: The following files were read: " + ', '.join(file_list)) + if len(file_list) == 0: + raise SystemExit("No fasta files available for alignment.") + outname = parsed.__dict__['out'] + print("[INFO]: Merged alignment will be written to " + outname + ".") + width = int(parsed.__dict__['width']) + ref_seq = parsed.__dict__['reference'] + + align_merger(file_list, outname, width, ref_seq) + + print("[INFO]: Done.") diff --git a/scripts/raise_aln.py b/scripts/raise_aln.py new file mode 100644 index 0000000..e5c5d55 --- /dev/null +++ b/scripts/raise_aln.py @@ -0,0 +1,60 @@ +# Author: Antoniya A. Aleksandrova +# Language: Python 3.5/2.7 +# Description: Move a specified alignment to the top of a fasta file +# Usage: python raise_aln.py -in -out -seq + +import os +import sys +import argparse + +def raise_seq(infile, outfile, seqn): + aligns = [] + name = "" + seq = "" + if '>' not in seqn: + seqn = '>' + seqn + f = open(infile, 'r') + for line in f: + if line.startswith(">"): + if seq != "": + aligns.append((name, seq)) + name = line.strip() + seq = "" + elif not line.startswith("#"): + seq = seq + line + aligns.append((name, seq)) + f.close() + + index = [x for x, y in enumerate(aligns) if y[0] == seqn] # locate the top sequence + if len(index)==0: + raise SystemExit(seqn + " cannot be located in " + infile) + else: + index = index[0] + out = open(outfile, 'w') + out.write(aligns[index][0] + '\n' + aligns[index][1].strip()) + for a in aligns: + if a[0]!=seqn: + out.write('\n' + a[0] + '\n' + a[1].strip()) + out.close() + + + +if __name__ == "__main__": + if len(sys.argv)<3: + raise SystemExit("Usage: python 
raise_aln.py -in -seq \n\tOptional inputs: -out ") + + parser = argparse.ArgumentParser() + + parser.add_argument('-in', '--fasta_file', nargs='?') + parser.add_argument('-out', '--out', nargs='?') # output file + parser.add_argument('-seq', '--seqname', nargs='?') # name of the sequence that should be moved to the top + + parser.set_defaults(out = '_raised.fasta') + parsed = parser.parse_args() + infile = parsed.__dict__['fasta_file'] + filename, file_extension = os.path.splitext(infile) + outfile = parsed.__dict__['out'] + if outfile == '_raised.fasta': + outfile = filename + '_raised' + file_extension + seqnanme = '>' + parsed.__dict__['seqname'] + raise_seq(infile, outfile, seqname) \ No newline at end of file diff --git a/scripts/tests/test.py b/scripts/tests/test.py new file mode 100644 index 0000000..8a2b339 --- /dev/null +++ b/scripts/tests/test.py @@ -0,0 +1,27 @@ +import pytest + +@pytest.mark.parametrize('refs,alis,msa', [(['IGE','IGE','IGE'], + [ + [ + 'I-G-E', + 'IRGIS', + 'TT---' + ], [ + 'IGE', + 'TTT' + ], [ + 'I-G-E', + 'F-F--' + ] + ], + ['0: I-G-E', '1: I-G-E', '2: I-G-E', + '3: IRGIS', '4: TT---', '5: T-T-T', '6: F-F--'] + )]) +def test_pyali_Alignmnet(refs, alis, msa): + from pyali.mrgali import * + a = Alignment.from_reference(refs) + a.merge(0, alis[0]) + a.merge(1, alis[1]) + a.merge(2, alis[2]) + assert str(a).split('\n') == msa + From 7af44e865e11823745948d96e535d0a7b7ed95d2 Mon Sep 17 00:00:00 2001 From: Antoniya Aleksandrova Date: Tue, 3 Mar 2020 19:01:28 -0500 Subject: [PATCH 2/9] pytest added --- scripts/align_merger.py | 12 +++--- scripts/raise_aln.py | 2 +- scripts/tests/examples/a1.fa | 8 ++++ scripts/tests/examples/a2.fa | 4 ++ scripts/tests/test_alis.py | 71 ++++++++++++++++++++++++++++++++++++ 5 files changed, 91 insertions(+), 6 deletions(-) create mode 100644 scripts/tests/examples/a1.fa create mode 100644 scripts/tests/examples/a2.fa create mode 100644 scripts/tests/test_alis.py diff --git a/scripts/align_merger.py 
b/scripts/align_merger.py index 1cfc540..d4f0873 100644 --- a/scripts/align_merger.py +++ b/scripts/align_merger.py @@ -1,15 +1,17 @@ # Author: Antoniya A. Aleksandrova -# Language: Python 3.5/2.7 +# Language: Python 3 # Description: Takes pairwise alignments and outputs a multiple alignment # Usage: python align_merger.py -in -out -width -ref import os -import numpy as np -import pandas as pd -import simplejson import glob -from pyali.mrgali import * import sys +# Use code between #s if you've cloned the github repository; otherwsise, as a standalone with pyali installed, replace it with #from pyali.mrgali import * +### +from pathlib import Path +sys.path.append(str(Path(__file__).resolve().parents[1]) + '/pyali') +from mrgali import Alignment +### import argparse from raise_aln import raise_seq diff --git a/scripts/raise_aln.py b/scripts/raise_aln.py index e5c5d55..efc81c0 100644 --- a/scripts/raise_aln.py +++ b/scripts/raise_aln.py @@ -1,5 +1,5 @@ # Author: Antoniya A. Aleksandrova -# Language: Python 3.5/2.7 +# Language: Python 3 # Description: Move a specified alignment to the top of a fasta file # Usage: python raise_aln.py -in -out -seq diff --git a/scripts/tests/examples/a1.fa b/scripts/tests/examples/a1.fa new file mode 100644 index 0000000..784d572 --- /dev/null +++ b/scripts/tests/examples/a1.fa @@ -0,0 +1,8 @@ +>template +DERE-T +>t1 +CC-CCC +>t2 +D-DDDD +>t3 +EEE-EE diff --git a/scripts/tests/examples/a2.fa b/scripts/tests/examples/a2.fa new file mode 100644 index 0000000..d5b86a2 --- /dev/null +++ b/scripts/tests/examples/a2.fa @@ -0,0 +1,4 @@ +>t4 +FFF-FFF +>template +--DERET diff --git a/scripts/tests/test_alis.py b/scripts/tests/test_alis.py new file mode 100644 index 0000000..b2b20ca --- /dev/null +++ b/scripts/tests/test_alis.py @@ -0,0 +1,71 @@ +import pytest +import sys +import os +from pathlib import Path + +sys.path.append(str(Path(__file__).resolve().parents[1])) +sys.path.append(str(Path(__file__).resolve().parents[2]) + '/pyali') + 
+testsdir = str(Path(__file__).resolve().parents[1]) + '/tests/' + +@pytest.mark.mrgali +@pytest.mark.parametrize('refs,alis,msa', [(['IGE','IGE','IGE'], + [ + [ + 'I-G-E', + 'IRGIS', + 'TT---' + ], [ + 'IGE', + 'TTT' + ], [ + 'I-G-E', + 'F-F--' + ] + ], + ['0: I-G-E', '1: I-G-E', '2: I-G-E', + '3: IRGIS', '4: TT---', '5: T-T-T', '6: F-F--'] + )]) +def test_pyali_Alignmnet(refs, alis, msa): + from mrgali import Alignment + a = Alignment.from_reference(refs) + a.merge(0, alis[0]) + a.merge(1, alis[1]) + a.merge(2, alis[2]) + assert str(a).split('\n') == msa + + +@pytest.mark.scripts +@pytest.mark.parametrize('infile,outfile,refn', [(testsdir + 'examples/a2.fa', testsdir + 'examples/a2_tmp.fa','template')]) +def test_raise_seq(infile, outfile, refn): + from raise_aln import raise_seq + assert os.path.isfile(infile) == True, '%s does not exist.'%(infile) + raise_seq(infile, outfile, refn) + assert os.path.isfile(outfile), '%s does not exist.'%(outfile) + with open(outfile) as f: + content = f.read().splitlines() + assert content[0] == '>' + refn, 'The sequence of %s was not raised to the top of the alignments.' 
% (refn) + os.remove(outfile) + +@pytest.mark.scripts +@pytest.mark.parametrize('file_list,outname,width,refn,result', [([testsdir + 'examples/a1.fa', testsdir + 'examples/a2.fa'], + testsdir + 'examples/a1_a2_merged.fa', 80, 'template', + ['>template', + '--DERE-T', + '>t1', + '--CC-CCC', + '>t2', + '--D-DDDD', + '>t3', + '--EEE-EE', + '>t4', + 'FFF-FF-F'] + )]) +def test_align_merger(file_list, outname, width, refn, result): + from align_merger import align_merger + align_merger(file_list, outname, width, refn) + assert os.path.isfile(outname), 'No output was produced' + with open(outname) as f: + content = f.read().splitlines() + assert content == result + From af9de3e40bb8864b2bc749bc051bdeff93ac6817 Mon Sep 17 00:00:00 2001 From: Antoniya Aleksandrova Date: Tue, 3 Mar 2020 19:03:10 -0500 Subject: [PATCH 3/9] remove old version --- scripts/tests/test.py | 27 --------------------------- 1 file changed, 27 deletions(-) delete mode 100644 scripts/tests/test.py diff --git a/scripts/tests/test.py b/scripts/tests/test.py deleted file mode 100644 index 8a2b339..0000000 --- a/scripts/tests/test.py +++ /dev/null @@ -1,27 +0,0 @@ -import pytest - -@pytest.mark.parametrize('refs,alis,msa', [(['IGE','IGE','IGE'], - [ - [ - 'I-G-E', - 'IRGIS', - 'TT---' - ], [ - 'IGE', - 'TTT' - ], [ - 'I-G-E', - 'F-F--' - ] - ], - ['0: I-G-E', '1: I-G-E', '2: I-G-E', - '3: IRGIS', '4: TT---', '5: T-T-T', '6: F-F--'] - )]) -def test_pyali_Alignmnet(refs, alis, msa): - from pyali.mrgali import * - a = Alignment.from_reference(refs) - a.merge(0, alis[0]) - a.merge(1, alis[1]) - a.merge(2, alis[2]) - assert str(a).split('\n') == msa - From 2ed42ab3beb98f60dda73e686138fc8705742566 Mon Sep 17 00:00:00 2001 From: Antoniya Aleksandrova Date: Tue, 3 Mar 2020 19:22:11 -0500 Subject: [PATCH 4/9] README added --- scripts/README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 scripts/README.md diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 
0000000..5886ded --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,15 @@ +# Working with FASTA files +The scripts here use **mrgali** (pyali) but can pre- and post-process files to make it easier to work directly with fasta files. + +`python raise_aln.py -in -out -seq ` + +will move the alignment specified by the -seq tag to the top of a fasta file. + + +`python align_merger.py -in -out -width -ref ` + +takes a list of alignment fasta files that share one common sequence (-ref) and merges them into a single multiple sequence alignment. See *tests/examples* for an example. + +Tests can be used by running `pytest -v`. + +If you're using the scripts separately from the repo, make sure that you've installed *pyali* and that you've changed the dependency, as specified, in *align_merger.py* \ No newline at end of file From 6ed8993b1c9cb9f92e36332767977bec8f395d44 Mon Sep 17 00:00:00 2001 From: Antoniya Aleksandrova Date: Fri, 13 Mar 2020 15:30:24 -0400 Subject: [PATCH 5/9] pytest.ini added --- pytest.ini | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 pytest.ini diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..c2b7416 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +markers = + mrgali: tests for code written by Chris Tang + scripts: tests for code written by Antoniya Aleksandrova From c64dd0b1613f0abff2f139002be1c18ac7bd4c46 Mon Sep 17 00:00:00 2001 From: Chris Tang Date: Thu, 9 Apr 2020 07:41:33 -0400 Subject: [PATCH 6/9] remove extra whitespace --- scripts/README.md | 8 ++++---- scripts/align_merger.py | 2 +- scripts/raise_aln.py | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/README.md b/scripts/README.md index 5886ded..7077d54 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -1,15 +1,15 @@ -# Working with FASTA files +# Working with FASTA files The scripts here use **mrgali** (pyali) but can pre- and post-process files to make it easier to work directly with fasta files. 
-`python raise_aln.py -in -out -seq ` +`python raise_aln.py -in -out -seq ` will move the alignment specified by the -seq tag to the top of a fasta file. `python align_merger.py -in -out -width -ref ` -takes a list of alignment fasta files that share one common sequence (-ref) and merges them into a single multiple sequence alignment. See *tests/examples* for an example. +takes a list of alignment fasta files that share one common sequence (-ref) and merges them into a single multiple sequence alignment. See *tests/examples* for an example. Tests can be used by running `pytest -v`. -If you're using the scripts separately from the repo, make sure that you've installed *pyali* and that you've changed the dependency, as specified, in *align_merger.py* \ No newline at end of file +If you're using the scripts separately from the repo, make sure that you've installed *pyali* and that you've changed the dependency, as specified, in *align_merger.py* diff --git a/scripts/align_merger.py b/scripts/align_merger.py index d4f0873..cce0a95 100644 --- a/scripts/align_merger.py +++ b/scripts/align_merger.py @@ -1,6 +1,6 @@ # Author: Antoniya A. Aleksandrova # Language: Python 3 -# Description: Takes pairwise alignments and outputs a multiple alignment +# Description: Takes pairwise alignments and outputs a multiple alignment # Usage: python align_merger.py -in -out -width -ref import os diff --git a/scripts/raise_aln.py b/scripts/raise_aln.py index efc81c0..29cd246 100644 --- a/scripts/raise_aln.py +++ b/scripts/raise_aln.py @@ -1,7 +1,7 @@ # Author: Antoniya A. 
Aleksandrova # Language: Python 3 -# Description: Move a specified alignment to the top of a fasta file -# Usage: python raise_aln.py -in -out -seq +# Description: Move a specified alignment to the top of a fasta file +# Usage: python raise_aln.py -in -out -seq import os import sys @@ -24,7 +24,7 @@ def raise_seq(infile, outfile, seqn): seq = seq + line aligns.append((name, seq)) f.close() - + index = [x for x, y in enumerate(aligns) if y[0] == seqn] # locate the top sequence if len(index)==0: raise SystemExit(seqn + " cannot be located in " + infile) @@ -57,4 +57,4 @@ def raise_seq(infile, outfile, seqn): if outfile == '_raised.fasta': outfile = filename + '_raised' + file_extension - seqnanme = '>' + parsed.__dict__['seqname'] - raise_seq(infile, outfile, seqname) \ No newline at end of file + seqname = '>' + parsed.__dict__['seqname'] # was misspelled `seqnanme`, so the call below raised NameError + raise_seq(infile, outfile, seqname) From 902e72612314f55a999e28f2fcb6facb73d46d15 Mon Sep 17 00:00:00 2001 From: Chris Tang Date: Thu, 9 Apr 2020 09:59:24 -0400 Subject: [PATCH 7/9] allow setup.py to install dependencies properly The setuptools module properly recognizes the install_requires directive. 
This allows users to properly install all requirements using `python setup.py install` --- .gitignore | 5 +++++ setup.py | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index a61b590..d893d25 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,8 @@ **/*.swp **/*.pyc **/.vscode + +.eggs/ +build/ +dist/ +pyali.egg-info/ diff --git a/setup.py b/setup.py index cd80236..1fdc3c6 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,8 @@ -from distutils.core import setup +from setuptools import setup setup( name = 'pyali', packages = ['pyali'], # this must be the same as the name above + python_requires = '>=3', install_requires = ['pandas', 'numpy', 'simplejson'], version = '0.1.1', description = 'A package for merging alignments', From e045629b9cbcd535c72164246bf8aeb52a4f8835 Mon Sep 17 00:00:00 2001 From: Chris Tang Date: Thu, 9 Apr 2020 10:03:02 -0400 Subject: [PATCH 8/9] update README with Quickstart instructions Users are advised to install pyali before using scripts in the project. --- README.md | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9a60a74..e573be4 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,14 @@ -# pyali -A library for manipulating multiple sequence alignments (bioinformatics, structural biology) +# pyali +A library for manipulating multiple sequence alignments (bioinformatics, structural biology) + +# Quickstart +We recommend you install `pyali` into its own virtual environment. 
+ For example, if you use `pyenv`, you would execute the following commands: +* `pyenv virtualenv 3.8.2 pyali-3.8.2` +* `pyenv activate pyali-3.8.2` +* `python setup.py install` OR `pip install .` + +You may also install `pyali` into an existing virtual environment for another project: +* `pyenv activate YOUR-PROJECT` +* `python setup.py install` OR `pip install .` From 14ffb24028b77fe8be8cb0e2be7372f0be5f209a Mon Sep 17 00:00:00 2001 From: Chris Tang Date: Thu, 9 Apr 2020 10:04:53 -0400 Subject: [PATCH 9/9] make scripts consistent with installed pyali Scripts should expect pyali to be installed. Otherwise, the user should see a helpful error message and the script should exit if this has not been done. --- scripts/align_merger.py | 15 +++++++-------- scripts/tests/test_alis.py | 3 +-- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/scripts/align_merger.py b/scripts/align_merger.py index cce0a95..a56dbf4 100644 --- a/scripts/align_merger.py +++ b/scripts/align_merger.py @@ -3,16 +3,15 @@ # Description: Takes pairwise alignments and outputs a multiple alignment # Usage: python align_merger.py -in -out -width -ref -import os +import argparse import glob +import os import sys -# Use code between #s if you've cloned the github repository; otherwsise, as a standalone with pyali installed, replace it with #from pyali.mrgali import * -### -from pathlib import Path -sys.path.append(str(Path(__file__).resolve().parents[1]) + '/pyali') -from mrgali import Alignment -### -import argparse + +try: + from pyali.mrgali import Alignment +except ImportError: # only a missing pyali is expected here; stop now rather than hit a NameError on Alignment later + raise SystemExit("[ERROR]: pyali has not been installed. 
To install, run `python setup.py install` from project directory.") from raise_aln import raise_seq diff --git a/scripts/tests/test_alis.py b/scripts/tests/test_alis.py index b2b20ca..c21a72b 100644 --- a/scripts/tests/test_alis.py +++ b/scripts/tests/test_alis.py @@ -4,7 +4,6 @@ from pathlib import Path sys.path.append(str(Path(__file__).resolve().parents[1])) -sys.path.append(str(Path(__file__).resolve().parents[2]) + '/pyali') testsdir = str(Path(__file__).resolve().parents[1]) + '/tests/' @@ -27,7 +26,7 @@ '3: IRGIS', '4: TT---', '5: T-T-T', '6: F-F--'] )]) def test_pyali_Alignmnet(refs, alis, msa): - from mrgali import Alignment + from pyali.mrgali import Alignment a = Alignment.from_reference(refs) a.merge(0, alis[0]) a.merge(1, alis[1])