Skip to content

Commit 2bd6c74

Browse files
authored
Merge pull request #1521 from haddocking/enhance-filename-generation-cg
Add helper functions for CG filename generation
2 parents 3fd590e + 3812fdf commit 2bd6c74

2 files changed

Lines changed: 139 additions & 60 deletions

File tree

src/haddock/libs/libaa2cg.py

Lines changed: 109 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -25,23 +25,26 @@
2525
- Implemented feature to check if nucleic acid is a candidate for hbond (Rodrigo Honorato 2018)
2626
"""
2727

28+
import collections
2829
import itertools
30+
import math
2931
import os
3032
import random
3133
import subprocess
34+
import tempfile
3235
import warnings
36+
3337
from pathlib import Path
34-
from haddock import log
35-
import tempfile
36-
import collections
37-
import math
38+
from io import StringIO
3839

39-
from Bio.PDB import Entity
40-
from Bio.PDB import PDBIO
41-
from Bio.PDB import PDBParser
40+
from Bio.PDB import Entity, PDBIO, PDBParser
41+
from Bio.PDB.Structure import Structure
4242
from Bio.PDB.StructureBuilder import StructureBuilder
4343

44+
from haddock import log
4445
from haddock.core.exceptions import ModuleError
46+
from haddock.core.typing import Optional
47+
from haddock.libs.libontology import Format
4548

4649
warnings.filterwarnings("ignore")
4750

@@ -143,8 +146,8 @@ def typesub(seq, patterns, types):
143146

144147
def ss_classification(ss, program="dssp"):
145148
"""
146-
Translates a string encoding the secondary structure to a string of corresponding Martini types, taking the
147-
origin of the secondary structure into account, and replacing termini if requested.
149+
Translates a string encoding the secondary structure to a string of corresponding Martini types, taking the
150+
origin of the secondary structure into account, and replacing termini if requested.
148151
149152
Args:
150153
ss:
@@ -544,7 +547,7 @@ def center_of_mass(entity, geometric=False):
544547
return [sum(coord_list) / sum(masses) for coord_list in w_pos]
545548

546549

547-
def determine_hbonds(structure):
550+
def determine_hbonds(structure: Structure):
548551
"""
549552
550553
Args:
@@ -752,8 +755,8 @@ def create_file_with_cryst(pdb_file: str) -> None:
752755
return file_out.name
753756

754757

755-
def determine_ss(structure, skipss, pdbf_path):
756-
"""
758+
def determine_ss(structure: Structure, skipss: bool, pdbf_path: str) -> Structure:
759+
"""Determine secondary structures from input structure
757760
758761
Args:
759762
structure:
@@ -807,39 +810,43 @@ def determine_ss(structure, skipss, pdbf_path):
807810
return structure
808811

809812

810-
def rename_nucbases(structure):
811-
"""
812-
813-
Args:
814-
structure:
815-
816-
Returns:
813+
def rename_nucbases(structure: Structure) -> None:
814+
"""Inplace residue renaming according to HADDOCK ones.
817815
816+
Parameters
817+
----------
818+
structure : Bio.PDB.Structure.Structure
819+
Input structure
818820
"""
819-
chainresdic = dict([(c.get_id(),
820-
[r.get_resname() for r in c.get_residues()]) for m in structure for c in m])
821+
chainresdic = {
822+
c.get_id(): [r.get_resname() for r in c.get_residues()]
823+
for m in structure
824+
for c in m
825+
}
821826

822827
nucleotide_list = ["CYT", "C", "DC", "THY", "T", "DT", "ADE",
823828
"A", "DA", "G", "GUA", "DG", "U", "URI"]
829+
rna_resname_mapper = {"CYT": "C", "URI": "U", "ADE": "A", "GUA": "G"}
830+
dna_rename_mapper = {"CYT": "DC", "THY": "DT", "ADE": "DA", "GUA": "DG"}
824831

825832
if [True for c in chainresdic for e in chainresdic[c] if e in nucleotide_list]:
826-
827-
if [True for c in chainresdic for e in chainresdic[c] if e in ["U", "URI"]]:
828-
# CG needs 1 letter for RNA
829-
ref_dic = {"CYT": "C", "URI": "U", "ADE": "A", "GUA": "G"}
830-
else:
831-
# CG needs 2 letters for DNA
832-
ref_dic = {"CYT": "DC", "THY": "DT", "ADE": "DA", "GUA": "DG"}
833-
833+
# Check if this is an RNA
834+
is_rna = [True for c in chainresdic for e in chainresdic[c] if e in ["U", "URI"]]
835+
ref_dic = rna_resname_mapper if is_rna else dna_rename_mapper
836+
# Loop over models
834837
for model in structure:
835838
for chain in model:
836839
for r in chain.get_residues():
837840
if r.resname in ref_dic.keys():
838-
# rename
841+
# Rename residue name
839842
r.resname = ref_dic[r.resname]
840843

841844

842-
def martinize(input_pdb: str, output_path: str, skipss: bool):
845+
def martinize(
846+
input_pdb: str,
847+
output_path: str,
848+
skipss: bool,
849+
) -> tuple[str, bool]:
843850
"""
844851
Converts an all-atom (AA) PDB structure into a coarse-grained (CG) model
845852
using a MARTINI2.2 mapping and generating CG-to-AA restraints for backmapping.
@@ -944,15 +951,21 @@ def martinize(input_pdb: str, output_path: str, skipss: bool):
944951

945952
cg_model = structure_builder.get_structure()
946953

947-
# Write CG structure
948-
cg_pdb_name = f"{output_path}/{Path(pdbf_path).stem}_cg.pdb"
954+
# Write pre-CG structure
949955
io.set_structure(cg_model)
950-
io.save("temp.pdb", write_end=1)
951-
956+
# Setup in-memory text buffer
957+
io_file = StringIO()
958+
# Write file in it
959+
io.save(io_file, write_end=1)
960+
# Go back to the start of the file to read it
961+
io_file.seek(0)
962+
963+
# Write the actual valid CG structure
952964
# make sure atom names are in the correct place
953965
# .BB. .BB1. .BB2. and not BB.. BB1.. BB2..
966+
cg_pdb_name = gen_cg_filename(f"../{output_path}", pdbf_path)
954967
out = open(cg_pdb_name, "w")
955-
for line in open("temp.pdb", "r"):
968+
for line in io_file.readlines():
956969
if line.startswith("ATOM"):
957970
atom_name = line[12:16].split()[0]
958971
# mind the spacing
@@ -964,14 +977,67 @@ def martinize(input_pdb: str, output_path: str, skipss: bool):
964977
n_l = line
965978
out.write(n_l)
966979
out.close()
967-
Path("temp.pdb").unlink(missing_ok=True)
980+
del io_file
968981

969-
# Write Restraints
970-
tbl_file_name = f"{output_path}/{Path(pdbf_path).stem}_cg_to_aa.tbl"
971-
tbl_file = open(tbl_file_name, "w")
972-
tbl_str = "\n".join([tbl for tbl in tbl_cg_to_aa if tbl])
973-
tbl_file.write(f"\n{tbl_str}")
974-
tbl_file.close()
982+
# Write CG to AA backmapping restraint file
983+
tbl_file_name = gen_cg_tbl_backmapping_fname(f"../{output_path}", pdbf_path)
984+
with open(tbl_file_name, "w") as tbl_file:
985+
tbl_file.write("\n" + "\n".join([tbl for tbl in tbl_cg_to_aa if tbl]))
975986

976987
return cg_pdb_name
977988

989+
990+
def gen_cg_filename(
991+
output_dir: str,
992+
input_fname: str,
993+
force_field: Optional[str] = None,
994+
ext: Optional[str] = None,
995+
) -> str:
996+
"""Helper function to standarize CG filename from input file.
997+
998+
Parameters
999+
----------
1000+
output_dir : str
1001+
Where to write the file.
1002+
input_fname : str
1003+
Name of the original input PDB file.
1004+
force_field : Optional[str], optional
1005+
Name of the force-field, by default None
1006+
ext : Optional[str], optional
1007+
File extension, by default None
1008+
1009+
Returns
1010+
-------
1011+
cg_fname : str
1012+
Name of the CG file.
1013+
"""
1014+
# Suffix for force-field if defined
1015+
ff_suffix = f"_{force_field}" if force_field else ""
1016+
# Set file extension
1017+
file_ext = ext if ext else Format.PDB
1018+
# Generate filepath
1019+
cg_fpath = Path(
1020+
output_dir,
1021+
f"{Path(input_fname).stem}_cg{ff_suffix}.{file_ext}"
1022+
)
1023+
cg_fname = str(cg_fpath)
1024+
return cg_fname
1025+
1026+
1027+
def gen_cg_tbl_backmapping_fname(output_dir: str, input_fname: str) -> Path:
1028+
"""Helper function to generate CG backmapping retraints filename.
1029+
1030+
Parameters
1031+
----------
1032+
output_dir : str
1033+
Where to write the file.
1034+
input_fname : str
1035+
Name of the original input PDB file.
1036+
1037+
Returns
1038+
-------
1039+
tbl_file_name: Path
1040+
Name of backmapping restraint filename.
1041+
"""
1042+
tbl_file_name = Path(output_dir, f"{Path(input_fname).stem}_cg_to_aa.tbl")
1043+
return tbl_file_name

src/haddock/modules/topology/topocg/__init__.py

Lines changed: 30 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,17 @@
2828
prepare_output,
2929
prepare_single_input,
3030
)
31+
from haddock.libs.libaa2cg import (
32+
martinize,
33+
gen_cg_filename,
34+
gen_cg_tbl_backmapping_fname,
35+
)
3136
from haddock.libs.libontology import Format, PDBFile, TopologyFile
3237
from haddock.libs.libstructure import make_molecules
3338
from haddock.libs.libsubprocess import CNSJob
3439
from haddock.modules import get_engine
3540
from haddock.modules.base_cns_module import BaseCNSModule
3641

37-
from haddock.libs.libaa2cg import martinize
38-
3942

4043
RECIPE_PATH = Path(__file__).resolve().parent
4144
DEFAULT_CONFIG = Path(RECIPE_PATH, MODULE_DEFAULT_YAML)
@@ -235,9 +238,9 @@ def _run(self) -> None:
235238

236239
# Get psf files for aa topology
237240
psf_files[i] = [
238-
Path(mol.as_posix()[:-4]+".psf")
239-
for mol in splited_models
240-
]
241+
Path(mol.parent, f"{mol.stem}.{Format.TOPOLOGY}")
242+
for mol in splited_models
243+
]
241244

242245
# get the MD5 hash of each model
243246
ens_dic[i] = [
@@ -316,29 +319,39 @@ def _run(self) -> None:
316319
md5_hash = None
317320
else:
318321
md5_hash = md5_dic[j]
319-
origin_name_model = origin_names[j]
322+
origin_name_model = model.stem
320323

324+
# FIXME: _model_id not used ?
321325
try:
322-
model_id = int(model.stem.split("_")[-2])
323-
origin_name_model = ("_").join(model.stem.split("_"))
326+
_model_id = int(model.stem.split("_")[-2])
324327
except ValueError:
325-
model_id = 0
326-
origin_name_model = str(model.stem).split(".pdb")[0]
327-
328+
_model_id = 0
329+
328330
shape_mod = shape_dic[i]
329331
if not shape_mod:
330-
processed_pdb = Path(f"{origin_name_model}_cg_{force_field}.{Format.PDB}")
331-
processed_topology = Path(f"{origin_name_model}_cg_{force_field}.{Format.TOPOLOGY}")
332-
processed_cgtoaa_tbl=Path(
333-
self.path.resolve().parent, f"{origin_name_model}_cg_to_aa.tbl").resolve()
332+
processed_pdb = gen_cg_filename(
333+
"",
334+
origin_name_model,
335+
force_field=force_field,
336+
)
337+
processed_topology = gen_cg_filename(
338+
"",
339+
origin_name_model,
340+
force_field=force_field,
341+
ext=Format.TOPOLOGY,
342+
)
343+
processed_cgtoaa_tbl = gen_cg_tbl_backmapping_fname(
344+
self.path.resolve().parent,
345+
origin_name_model,
346+
)
334347
else:
335348
processed_pdb = Path(f"{origin_name_model}.{Format.PDB}")
336349
processed_topology = Path(f"{origin_name_model}.{Format.TOPOLOGY}")
337-
processed_cgtoaa_tbl=None
350+
processed_cgtoaa_tbl = None
338351

339352
topology = TopologyFile(processed_topology, path=".")
340353
psf_file_uniq = psf_files[i][j].as_posix().split('/')
341-
aa_topo_path = psf_file_uniq[0] + "/" + psf_file_uniq[1]
354+
aa_topo_path = f"{psf_file_uniq[0]}/{psf_file_uniq[1]}"
342355
aa_topology = TopologyFile(psf_file_uniq[2], path=aa_topo_path)
343356
pdb = PDBFile(
344357
file_name=processed_pdb,

0 commit comments

Comments
 (0)