christang · AntoniyaAleksandrova · Mar 2, 2020 · Mar 4, 2020 · Mar 4, 2020 · Mar 4, 2020
diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,8 @@
 **/*.swp
 **/*.pyc
 **/.vscode
+
+.eggs/
+build/
+dist/
+pyali.egg-info/
diff --git a/README.md b/README.md
@@ -1,2 +1,14 @@
-# pyali
-A library for manipulating multiple sequence alignments (bioinformatics, structural biology)
+# pyali
+A library for manipulating multiple sequence alignments (bioinformatics, structural biology)
+
+# Quickstart
+We recommend you install `pyali` into its own virtual environment.
+
+For example, if you use `pyenv`, you would execute the following commands:
+* `pyenv virtualenv 3.8.2 pyali-3.8.2`
+* `pyenv activate pyali-3.8.2`
+* `python setup.py install` OR `pip install .`
+
+You may also install `pyali` into an existing virtual environment for another project:
+* `pyenv activate YOUR-PROJECT`
+* `python setup.py install` OR `pip install .`
diff --git a/pytest.ini b/pytest.ini
@@ -0,0 +1,4 @@
+[pytest]
+markers =
+    mrgali: tests for code written by Chris Tang
+    scripts: tests for code written by Antoniya Aleksandrova
diff --git a/scripts/README.md b/scripts/README.md
@@ -0,0 +1,15 @@
+# Working with FASTA files
+The scripts here use **mrgali** (pyali) but can pre- and post-process files to make it easier to work directly with fasta files.
+
+`python raise_aln.py -in <fasta file> -out <output file> -seq <sequence name>`
+
+will move the alignment specified by the -seq tag to the top of a fasta file.
+
+
+`python align_merger.py -in <list with fasta files> -out <output file> -width <width of lines in output fasta> -ref <name of reference sequence>`
+
+takes a list of alignment fasta files that share one common sequence (-ref) and merges them into a single multiple sequence alignment. See *tests/examples* for an example.
+
+Tests can be used by running `pytest -v`.
+
+If you're using the scripts separately from the repo, make sure that you've installed *pyali* and that you've changed the dependency, as specified, in *align_merger.py*
diff --git a/scripts/align_merger.py b/scripts/align_merger.py
@@ -0,0 +1,127 @@
+# Author: Antoniya A. Aleksandrova
+# Language: Python 3
+# Description: Takes pairwise alignments and outputs a multiple alignment
+# Usage: python align_merger.py -in <list with fasta files> -out <output file> -width <width of lines in output fasta> -ref <name of reference sequence>
+
+import argparse
+import glob
+import os
+import sys
+
+try:
+    from pyali.mrgali import Alignment
+except:
+    print("[ERROR]: pyali has not been installed. To install, run `python setup.py install` from project directory.")
+from raise_aln import raise_seq
+
+
+def align_merger(file_list, outname, width, reference_seq):
+    refs, alis, alis_names = [], [], []
+    for f in file_list:
+        if reference_seq != '':
+            raise_seq(f, f + '.tmp', reference_seq)
+            alignment = open(f + '.tmp', 'r')
+        else:
+            alignment = open(f, 'r')
+        flag = 0
+        sequence1 = ""
+        sequence2 = ""
+        alis_element = []
+        for line in alignment:
+            if line.startswith(">") and flag == 2:
+                alis_element.append(sequence2)
+                alis_names.append(name2)
+                sequence2 = ""
+                name2 = line.strip()
+            if line.startswith(">") and flag == 1:
+                flag = 2
+                alis_element.append(sequence1)
+                name2 = line.strip()
+            if line.startswith(">") and flag == 0:
+                flag = 1
+                name1 = line.strip()
+            if not line.startswith(">") and flag == 1:
+                sequence1 = sequence1 + line.strip().upper()
+            if not line.startswith(">") and flag == 2:
+                sequence2 = sequence2 + line.strip().upper()
+        alignment.close()
+        if reference_seq != '':
+            os.remove(f + '.tmp')
+        alis_element.append(sequence2)
+        alis.append(alis_element)
+        alis_names.append(name2)
+
+    refs = [''.join([s for s in seqs[0] if s != '-']) for seqs in alis]
+    if refs.count(refs[0]) != len(refs):
+        print("The reference sequences in all the provided alignments are not identical.")
+        for i, r in enumerate(refs[1:]):
+            for j, s in enumerate(refs[0]):
+                if s!=refs[i+1][j]:
+                    print(file_list[0] +": (" + str(j) + "," + s + "), " + file_list[i+1] + ": " + r[j])
+        raise SystemExit("References need to be the same to proceed.")
+
+    a = Alignment.from_reference(refs)
+    for i in range(len(alis)):
+        a.merge(i, alis[i])
+
+    flds = str(a).split('\n')
+
+    aligned_list = []
+    out = open(outname, 'w')
+    for i, ln in enumerate(flds):
+        if i == 0:
+            s = ln[ln.index(':') + 2:]
+            out.write(name1 + '\n')
+            aligned_list.append((name1, s))
+            while len(s) > 0:
+                out.write(s[:width] + '\n')
+                s = s[width:]
+        if i >= len(refs):
+            s = ln[ln.index(':') + 2:]
+            out.write(alis_names[i - len(refs)] + '\n')
+            aligned_list.append((alis_names[i - len(refs)], s))
+            while len(s) > 0:
+                out.write(s[:width] + '\n')
+                s = s[width:]
+    out.close()
+    return aligned_list
+
+
+if __name__ == "__main__":
+
+    # Remove previously generated merged alignments
+    if os.path.isfile('merged_alignments.fasta'):
+        os.remove('merged_alignments.fasta')
+
+    # Parse user input and set defaults
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('-in', '--list_of_fasta_files', nargs='?')  # file with a list of fasta files (one per line)
+    parser.add_argument('-out', '--out', nargs='?')  # output file
+    parser.add_argument('-width', '--width', nargs='?')  # fasta line width
+    parser.add_argument('-ref', '--reference', nargs='?')  # name of the reference sequence
+
+    parser.set_defaults(list_of_fasta_files=glob.glob('*.fasta'))
+    parser.set_defaults(out='merged_alignments.fasta')
+    parser.set_defaults(width='72')
+    parser.set_defaults(reference='')
+
+    parsed = parser.parse_args()
+    if type(parsed.__dict__['list_of_fasta_files']) == list:
+        file_list = parsed.__dict__['list_of_fasta_files']
+        print("[INFO]: Pairwise alignments will be taken from all fasta files in current directory.")
+    else:
+        with open(parsed.__dict__['list_of_fasta_files'], 'r') as f:
+            file_list = f.read().strip().split('\n')
+        print("[INFO]: Pairwise alignment files read from " + parsed.__dict__['list_of_fasta_files'] + ".")
+    print("[INFO]: The following files were read: " + ', '.join(file_list))
+    if len(file_list) == 0:
+        raise SystemExit("No fasta files available for alignment.")
+    outname = parsed.__dict__['out']
+    print("[INFO]: Merged alignment will be written to " + outname + ".")
+    width = int(parsed.__dict__['width'])
+    ref_seq = parsed.__dict__['reference']
+
+    align_merger(file_list, outname, width, ref_seq)
+
+    print("[INFO]: Done.")
diff --git a/scripts/raise_aln.py b/scripts/raise_aln.py
@@ -0,0 +1,60 @@
+# Author: Antoniya A. Aleksandrova
+# Language: Python 3
+# Description: Move a specified alignment to the top of a fasta file
+# Usage: python raise_aln.py -in <fasta file> -out <output file> -seq <sequence name>
+
+import os
+import sys
+import argparse
+
+def raise_seq(infile, outfile, seqn):
+	aligns = []
+	name = ""
+	seq = ""
+	if '>' not in seqn:
+		seqn = '>' + seqn
+	f = open(infile, 'r')
+	for line in f:
+		if line.startswith(">"):
+			if seq != "":
+				aligns.append((name, seq))
+			name = line.strip()
+			seq = ""
+		elif not line.startswith("#"):
+			seq = seq + line
+	aligns.append((name, seq))
+	f.close()
+
+	index = [x for x, y in enumerate(aligns) if y[0] == seqn] # locate the top sequence
+	if len(index)==0:
+		raise SystemExit(seqn + " cannot be located in " + infile)
+	else:
+		index = index[0]
+	out = open(outfile, 'w')
+	out.write(aligns[index][0] + '\n' + aligns[index][1].strip())
+	for a in aligns:
+		if a[0]!=seqn:
+			out.write('\n' + a[0] + '\n' + a[1].strip())
+	out.close()
+
+
+
+if __name__ == "__main__":
+	if len(sys.argv)<3:
+		raise SystemExit("Usage: python raise_fasta.py -in <fasta file> -seq <sequence name>\n\tOptional inputs: -out <output file>")
+
+	parser = argparse.ArgumentParser()
+
+	parser.add_argument('-in', '--fasta_file', nargs='?')
+	parser.add_argument('-out', '--out', nargs='?') # output file
+	parser.add_argument('-seq', '--seqname', nargs='?') # name of the sequence that should be moved to the top
+
+	parser.set_defaults(out = '_raised.fasta')
+	parsed = parser.parse_args()
+	infile = parsed.__dict__['fasta_file']
+	filename, file_extension = os.path.splitext(infile)
+	outfile = parsed.__dict__['out']
+	if outfile == '_raised.fasta':
+		outfile = filename + '_raised' + file_extension
+	seqnanme = '>' + parsed.__dict__['seqname']
+	raise_seq(infile, outfile, seqname)
diff --git a/scripts/tests/examples/a1.fa b/scripts/tests/examples/a1.fa
@@ -0,0 +1,8 @@
+>template
+DERE-T
+>t1
+CC-CCC
+>t2
+D-DDDD
+>t3
+EEE-EE
diff --git a/scripts/tests/examples/a2.fa b/scripts/tests/examples/a2.fa
@@ -0,0 +1,4 @@
+>t4
+FFF-FFF
+>template
+--DERET
diff --git a/scripts/tests/test_alis.py b/scripts/tests/test_alis.py
@@ -0,0 +1,70 @@
+import pytest
+import sys
+import os
+from pathlib import Path
+
+sys.path.append(str(Path(__file__).resolve().parents[1]))
+
+testsdir = str(Path(__file__).resolve().parents[1]) + '/tests/'
+
+@pytest.mark.mrgali
+@pytest.mark.parametrize('refs,alis,msa', [(['IGE','IGE','IGE'],
+                                                  [
+                                                      [
+                                                          'I-G-E',
+                                                          'IRGIS',
+                                                          'TT---'
+                                                      ], [
+                                                          'IGE',
+                                                          'TTT'
+                                                      ], [
+                                                          'I-G-E',
+                                                          'F-F--'
+                                                      ]
+                                                  ],
+                                                  ['0: I-G-E', '1: I-G-E', '2: I-G-E',
+                                                   '3: IRGIS', '4: TT---', '5: T-T-T', '6: F-F--']
+                                                  )])
+def test_pyali_Alignmnet(refs, alis, msa):
+    from pyali.mrgali import Alignment
+    a = Alignment.from_reference(refs)
+    a.merge(0, alis[0])
+    a.merge(1, alis[1])
+    a.merge(2, alis[2])
+    assert str(a).split('\n') == msa
+
+
+@pytest.mark.scripts
+@pytest.mark.parametrize('infile,outfile,refn', [(testsdir + 'examples/a2.fa', testsdir + 'examples/a2_tmp.fa','template')])
+def test_raise_seq(infile, outfile, refn):
+    from raise_aln import raise_seq
+    assert os.path.isfile(infile) == True, '%s does not exist.'%(infile)
+    raise_seq(infile, outfile, refn)
+    assert os.path.isfile(outfile), '%s does not exist.'%(outfile)
+    with open(outfile) as f:
+        content = f.read().splitlines()
+    assert content[0] == '>' + refn, 'The sequence of %s was not raised to the top of the alignments.' % (refn)
+    os.remove(outfile)
+
+@pytest.mark.scripts
+@pytest.mark.parametrize('file_list,outname,width,refn,result', [([testsdir + 'examples/a1.fa', testsdir + 'examples/a2.fa'],
+                                                           testsdir + 'examples/a1_a2_merged.fa', 80, 'template',
+                                                                         ['>template',
+                                                                         '--DERE-T',
+                                                                         '>t1',
+                                                                         '--CC-CCC',
+                                                                         '>t2',
+                                                                         '--D-DDDD',
+                                                                         '>t3',
+                                                                         '--EEE-EE',
+                                                                         '>t4',
+                                                                         'FFF-FF-F']
+                                                                  )])
+def test_align_merger(file_list, outname, width, refn, result):
+    from align_merger import align_merger
+    align_merger(file_list, outname, width, refn)
+    assert os.path.isfile(outname), 'No output was produced'
+    with open(outname) as f:
+        content = f.read().splitlines()
+    assert content == result
+
diff --git a/setup.py b/setup.py
@@ -1,7 +1,8 @@
-from distutils.core import setup
+from setuptools import setup
 setup(
   name = 'pyali',
   packages = ['pyali'], # this must be the same as the name above
+  python_requires = '>=3',
   install_requires = ['pandas', 'numpy', 'simplejson'],
   version = '0.1.1',
   description = 'A package for merging alignments',