haddock3/src/haddock/libs/libaa2cg.py at b98db551b2ceb6851cea7a050fde2c5548bd5444 · haddocking/haddock3 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#  CODE TAKEN FROM MARTINIZE 1.1 ##
# Please refer to it for more information
"""
Reduces complexity of protein residue to the MARTINI coarse grained model:
CA, O, Bead(s) in specific atom location.

Reference:
Monticelli et al. The MARTINI coarse-grained force field: extension to proteins.
J. Chem. Theory Comput. (2008) vol. 4 (5) pp. 819-834

Martinize Script from Tserk Wassenaar

Uses Biopython to parse the structure and DSSP output.
Uses pieces of the martinize-1.1.py script to convert the SS types

Outputs a coarse grained pdb file (*_ss.pdb) with assigned bfactors.
Outputs a tbl file to map the beads to the atoms they represent.

Updates
 - Updated python version to 2.6 to support isdisjoint() set method in DSSP.py (JR Apr 2012)
 - Residues that DSSP can't handle (incomplete backbone f ex) treated as coil  (JR Apr 2012)
 - Update to_one_letter_code library to protein_letters_3to1 (Jorge Roel 2017)
 - Inclusion of fake beads for corresponding amino-acids <SCd> (Jorge Roel 2017)
 - Changed the mapping routine to include DNA bead types (Rodrigo Honorato 2018)
 - Implemented feature to check if nucleic acid is a candidate for hbond (Rodrigo Honorato 2018)
"""

import collections
import itertools
import math
import os
import random
import subprocess
import tempfile
import warnings

from pathlib import Path
from io import StringIO

from Bio.PDB import Entity, PDBIO, PDBParser
from Bio.PDB.Structure import Structure
from Bio.PDB.StructureBuilder import StructureBuilder

from haddock import log
from haddock.core.exceptions import ModuleError
from haddock.core.typing import Optional
from haddock.libs.libontology import Format

warnings.filterwarnings("ignore")

CRYST_LINE = "CRYST1 " + os.linesep

def norm(a):
    """

    Args:
        a:

    Returns:

    """
    return math.sqrt(norm2(a))


def norm2(a):
    """

    Args:
        a:

    Returns:

    """
    return sum([i * i for i in a])


def pat(x, c="."):
    """
    Reformats pattern strings.

    Args:
        x:
        c:

    Returns:

    """
    return x.replace(c, "\x00").split()


def hash(x, y):
    """
    Makes a dictionary from two lists.

    Args:
        x:
        y:

    Returns:

    """
    return dict(zip(x, y))


def spl(x):
    """
    Splits a string.

    Args:
        x:

    Returns:

    """
    return x.split()


def tt(program):
    """

    Args:
        program:

    Returns:

    """
    return "".join([ssd[program].get(chr(i), "C") for i in range(256)])


def typesub(seq, patterns, types):
    """
    Pattern substitutions.

    Args:
        seq:
        patterns:
        types:

    Returns:

    """
    for i, j in zip(patterns, types):
        seq = seq.replace(i, j)
    return seq


def ss_classification(ss, program="dssp"):
    """
    Translates a string encoding the secondary structure to a string of corresponding Martini types, taking the
    origin of the secondary structure into account, and replacing termini if requested.

    Args:
        ss:
        program:

    Returns:

    """
    # Translate dssp/pymol/gmx ss to Martini ss
    ss = ss.translate(sstt[program])
    # Separate the different secondary structure types
    sep = dict([(i, ss.translate(sstd[i])) for i in sstd.keys()])
    # Do type substitutions based on patterns
    # If the ss type is not in the patterns lists, do not substitute
    # (use empty lists for substitutions)

    typ = [typesub(sep[i], patterns.get(i, []), pattypes.get(i, []))
           for i in sstd.keys()]
    # Translate all types to numerical values
    typ = [[ord(j) for j in list(i)] for i in typ]
    # Sum characters back to get a full typed sequence
    typ = "".join([chr(sum(i)) for i in zip(*typ)])
    # Return both the actual as well as the fully typed sequence
    return ss, typ

# ----+--------------------------------------+
#   A | SECONDARY STRUCTURE TYPE DEFINITIONS |
# ----+--------------------------------------+


ssdefs = {
    "dssp": list(".HGIBETSC~"),  # DSSP one letter secondary structure code     #@#
    "pymol": list(".H...S...L"),  # Pymol one letter secondary structure code    #@#
    "gmx": list(".H...ETS.C"),  # Gromacs secondary structure dump code        #@#
    "self": list("FHHHEETSCC")  # Internal CG secondary structure codes        #@#
}
cgss = list("FHHHEETSCC")  # Corresponding CG secondary structure types   #@#

patterns = {
    "H": pat(".H. .HH. .HHH. .HHHH. .HHHHH. .HHHHHH. .HHHHHHH. .HHHH HHHH.")  # @#
}
pattypes = {
    "H": pat(".3. .33. .333. .3333. .13332. .113322. .1113222. .1111 2222.")  # @#
}

ss_to_code = {"C": 1,  # Free,
              " ": 1,
              "S": 2,
              "H": 3,
              "1": 4,
              "2": 5,
              "3": 6,
              "E": 7,  # Extended
              "T": 8,  # Turn
              "F": 9  # Fibril
              }

ss_eq = list("CBHHHHBTF")

# List of programs for which secondary structure definitions can be processed
programs = ssdefs.keys()

# Dictionaries mapping ss types to the CG ss types
ssd = dict([(i, hash(ssdefs[i], cgss)) for i in programs])

# The translation table depends on the program used to obtain the
# secondary structure definitions
sstt = dict([(i, tt(i)) for i in programs])

# The following translation tables are used to identify stretches of
# a certain type of secondary structure.
null = "\x00"
sstd = dict([(i, ord(i) * null + i + (255 - ord(i)) * null) for i in cgss])

# ==========================================================================================#
# ==========================================================================================#
# ==========================================================================================#

# CG MAPPING INFORMATION

bb = "CA C N O "
prot_atoms = {"ALA": [bb + "CB"],
              "CYS": [bb, "CB SG"],
              "ASP": [bb, "CB CG OD1 OD2"],
              "GLU": [bb, "CB CG CD OE1 OE2"],
              "PHE": [bb, "CB CG CD1", "CD2 CE2", "CE1 CZ"],
              "GLY": [bb],
              "HIS": [bb, "CB CG", "CD2 NE2", "ND1 CE1"],
              "ILE": [bb, "CB CG1 CG2 CD1"],
              "LYS": [bb, "CB CG CD", "CE NZ"],
              "LEU": [bb, "CB CG CD1 CD2"],
              "MET": [bb, "CB CG SD CE"],
              "ASN": [bb, "CB CG ND1 ND2 OD1 OD2"],  # ND1?
              "PRO": [bb, "CB CG CD"],
              "GLN": [bb, "CB CG CD OE1 OE2 NE1 NE2"],
              "ARG": [bb, "CB CG CD", "NE CZ NH1 NH2"],
              "SER": [bb, "CB OG"],
              "THR": [bb, "CB OG1 CG2"],
              "VAL": [bb, "CB CG1 CG2"],
              "TRP": [bb, "CB CG CD2", "CD1 NE1 CE2", "CE3 CZ3", "CZ2 CH2"],
              "TYR": [bb, "CB CG CD1", "CD2 CE2", "CE1 CZ OH"]}

bead_names = ["BB", "SC1", "SC2", "SC3", "SC4"]

# insert beads into the data structure
cg_mapping = {}
for res in prot_atoms:
    cg_mapping[res] = collections.OrderedDict()
    for i, atom_l in enumerate(prot_atoms[res]):
        bead = bead_names[i]
        cg_mapping[res][atom_l] = bead

######################################
# Nucleotide mapping,
# This is a custom naming convention
# but the atom mapping is defined in
# 10.1021/acs.jctc.5b00286 -  S1
######################################

DA_beads = collections.OrderedDict()
DA_beads["O3'* P O1P O2P O5' OP1 OP2"] = "BB1"
DA_beads["C5' O4' C4'"] = "BB2"
DA_beads["C3' C2' C1'"] = "BB3"
DA_beads["N9 C4"] = "SC1"
DA_beads["C2 N3"] = "SC2"
DA_beads["C6 N6 N1"] = "SC3"
DA_beads["C8 N7 C5"] = "SC4"

DC_beads = collections.OrderedDict()
DC_beads["O3'* P O1P O2P O5' OP1 OP2"] = "BB1"
DC_beads["C5' O4' C4'"] = "BB2"
DC_beads["C3' C2' C1'"] = "BB3"
DC_beads["N1 C6"] = "SC1"
DC_beads["N3 C2 O2"] = "SC2"
DC_beads["C5 C4 N4"] = "SC3"

DG_beads = collections.OrderedDict()
DG_beads["O3'* P O1P O2P O5' OP1 OP2"] = "BB1"
DG_beads["C5' O4' C4'"] = "BB2"
DG_beads["C3' C2' C1'"] = "BB3"
DG_beads["N9 C4"] = "SC1"
DG_beads["C2 N2 N3"] = "SC2"
DG_beads["C6 O6 N1"] = "SC3"
DG_beads["C8 N7 C5"] = "SC4"

DT_beads = collections.OrderedDict()
DT_beads["O3'* P O1P O2P O5' OP1 OP2"] = "BB1"
DT_beads["C5' O4' C4'"] = "BB2"
DT_beads["C3' C2' C1'"] = "BB3"
DT_beads["N1 C6"] = "SC1"
DT_beads["N3 C2 O2"] = "SC2"
DT_beads["C5 C4 O4 C7"] = "SC3"

A_beads = collections.OrderedDict()
A_beads["O3'* P O1P O2P O5' OP1 OP2"] = "BB1"
A_beads["C5' O4' C4'"] = "BB2"
A_beads["C3' C2' O2' C1'"] = "BB3"
A_beads["N9 C4"] = "SC1"
A_beads["C2 N3"] = "SC2"
A_beads["C6 N6 N1"] = "SC3"
A_beads["C8 N7 C5"] = "SC4"

C_beads = collections.OrderedDict()
C_beads["O3'* P O1P O2P O5' OP1 OP2"] = "BB1"
C_beads["C5' O4' C4'"] = "BB2"
C_beads["C3' C2' O2' C1'"] = "BB3"
C_beads["N1 C6"] = "SC1"
C_beads["N3 C2 O2"] = "SC2"
C_beads["C5 C4 N4"] = "SC3"

G_beads = collections.OrderedDict()
G_beads["O3'* P O1P O2P O5' OP1 OP2"] = "BB1"
G_beads["C5' O4' C4'"] = "BB2"
G_beads["C3' C2' O2' C1'"] = "BB3"
G_beads["N9 C4"] = "SC1"
G_beads["C2 N2 N3"] = "SC2"
G_beads["C6 O6 N1"] = "SC3"
G_beads["C8 N7 C5"] = "SC4"

U_beads = collections.OrderedDict()
U_beads["O3'* P O1P O2P O5' OP1 OP2"] = "BB1"
U_beads["C5' O4' C4'"] = "BB2"
U_beads["C3' C2' O2' C1'"] = "BB3"
U_beads["N1 C6"] = "SC1"
U_beads["N3 C2 O2"] = "SC2"
U_beads["C5 C4 O4"] = "SC3"

cg_mapping["DA"] = DA_beads
cg_mapping["DC"] = DC_beads
cg_mapping["DT"] = DT_beads
cg_mapping["DG"] = DG_beads

cg_mapping["A"] = A_beads
cg_mapping["C"] = C_beads
cg_mapping["U"] = U_beads
cg_mapping["G"] = G_beads

pairing = {
    ("DG", "DC"): [("N2", "O2"), ("N1", "N3"), ("O6", "N4")],
    ("DC", "DG"): [("O2", "N2"), ("N3", "N1"), ("N4", "O6")],
    ("DA", "DT"): [("N6", "O4"), ("N1", "N3")],
    ("DT", "DA"): [("O4", "N6"), ("N3", "N1")],
    #
    ("G", "C"): [("N2", "O2"), ("N1", "N3"), ("O6", "N4")],
    ("C", "G"): [("O2", "N2"), ("N3", "N1"), ("N4", "O6")],
    ("A", "U"): [("N6", "O4"), ("N1", "N3")],
    ("U", "A"): [("O4", "N6"), ("N3", "N1")],

}

polar = ["GLN", "ASN", "SER", "THR"]
charged = ["ARG", "LYS", "ASP", "GLU"]


def add_dummy(bead_list, dist=0.11, n=2):
    """

    Args:
        bead_list:
        dist:
        n:

    Returns:

    """
    new_bead_dic = {}

    # Generate a random vector in a sphere of -1 to +1, to add to the bead position
    v = [random.random() * 2. - 1, random.random() * 2. - 1, random.random() * 2. - 1]

    # Calculated the length of the vector and divide by the final distance of the dummy bead
    norm_v = norm(v) / dist

    # Resize the vector
    vn = [i / norm_v for i in v]

    # m sets the direction of the added vector, currently only works when adding one or two beads.
    m = 1
    for j in range(n):  # create two new beads
        bead_s = str(j + 1)
        new_name = f"SCD{bead_s}"  # set the name of the new bead
        new_bead_dic[new_name] = [i + (m * j) for i, j in zip(bead_list[-1][1], vn)]
        m *= -2
    return new_bead_dic


def map_cg(chain):
    """

    Args:
        chain:

    Returns:

    """
    m_dic = collections.OrderedDict()

    for aares in chain:
        m_dic[aares] = collections.OrderedDict()

        resn = aares.resname.split()[0]  # resname
        segid = aares.segid.strip()
        resi = aares.id[1]

        if resn not in cg_mapping:
            log.warning(f"Residue {resn} not in cg_mapping — skipping.")
            continue  # skip to next residue

        # for each atom segment, calculate its center of mass and map the correct bead
        for atom_segment in cg_mapping[resn]:
            atoms = [aares[a] for a in atom_segment.split() if a in aares.child_dict]

            if atoms:
                if "*" in atom_segment:  # this is important to correctly place CG DNA beads

                    # this * means it belongs to the previous residue... find it!
                    target_previous_atom_list = [a for a in atom_segment.split() if "*" in a]

                    for target_atom in target_previous_atom_list:
                        # does it exist?
                        target_atom_name = target_atom.split("*")[0]
                        try:
                            previous_atom = chain[resi - 1][target_atom_name]
                            # how far away the previous atom is from this atom segment?
                            #  if it is too far away this could be the next chain...!
                            minimum_dist = min([(a - previous_atom) for a in atoms])
                            if minimum_dist < 2.0:  # 2.0 A is very permissive
                                atoms.append(previous_atom)
                        except KeyError:
                            # previous atom not found, move on
                            pass

            if not atoms:
                log.warning('Residue {} {:d} of chain {} cannot be processed: missing atoms {} '.
                                 format(resn, resi, aares.parent.id, atom_segment))
                continue

            bead_name = cg_mapping[resn][atom_segment]

            # get center of mass
            code = list(set([a.bfactor for a in aares if a.bfactor != 0]))

            if len(code) > 1:
                emsg = "Something is wrong with HADDOCK codes"
                raise ModuleError(emsg)

            if not code:
                code = 0.0
            else:
                code = code[0]

            bead_coord = center_of_mass(atoms)

            atom_segment = " ".join(list(atom_segment.split())).replace("*", "")

            # restrain for backmapping
            restrain = "assign (segid {}CG and resid {:d} and name {})"\
                " (segid {} and resid {:d} and (name {})) 0 0 0".\
                format(segid, resi, bead_name, segid, resi, " or name ".join(atom_segment.split()))

            m_dic[aares][bead_name] = bead_coord, code, restrain

    # add dummy beads whenever its needed
    for r in m_dic:

        if r.resname in polar:
            d = 0.14  # distance
            n = 2  # number of dummy beads to be placed

        elif r.resname in charged:
            d = 0.11  # distance
            n = 1  # number of dummy beads to be placed

        else:
            continue

        # add to data structure
        # this special beads have no HADDOCK code
        bead_list = [(b, m_dic[r][b][0]) for b in m_dic[r]]
        dummy_bead_dic = add_dummy(bead_list, dist=d, n=n)
        for db in dummy_bead_dic:
            db_coords = dummy_bead_dic[db]
            # code should be the same as the residue
            code = m_dic[r][list(m_dic[r])[0]][1]

            m_dic[r][db] = (db_coords, code, None)

    return m_dic


def center_of_mass(entity, geometric=False):
    """
    Returns gravitic [default] or geometric center of mass of an Entity.
    Geometric assumes all masses are equal (geometric=True)

    Args:
        entity:
        geometric:

    Returns:

    """
    # Structure, Model, Chain, Residue
    if isinstance(entity, Entity.Entity):
        atom_list = entity.get_atoms()
    # List of Atoms
    elif hasattr(entity, "__iter__") and [x for x in entity if x.level == "A"]:
        atom_list = entity
    else:  # Some other weirdo object
        raise ValueError("Center of Mass can only be calculated from the following objects:\n"
                         "Structure, Model, Chain, Residue, list of Atoms.")

    masses = []
    positions = [[], [], []]  # [ [X1, X2, ..] , [Y1, Y2, ...] , [Z1, Z2, ...] ]

    for atom in atom_list:
        masses.append(atom.mass)

        for i, coord in enumerate(atom.coord.tolist()):
            positions[i].append(coord)

    # If there is a single atom with undefined mass complain loudly.
    if "ukn" in set(masses) and not geometric:
        raise ValueError(f"Some Atoms don't have an element assigned.{os.linesep}"
                         "Try adding them manually or calculate the geometrical "
                         "center of mass instead.")

    if geometric:
        return [sum(coord_list) / len(masses) for coord_list in positions]

    w_pos = [[], [], []]
    for atom_index, atom_mass in enumerate(masses):
        w_pos[0].append(positions[0][atom_index] * atom_mass)
        w_pos[1].append(positions[1][atom_index] * atom_mass)
        w_pos[2].append(positions[2][atom_index] * atom_mass)

    return [sum(coord_list) / sum(masses) for coord_list in w_pos]


def determine_hbonds(structure: Structure):
    """

    Args:
        structure:

    Returns:

    """
    nuc = ["DA", "DC", "DG", "DT", "A", "C", "G", "U"]
    aa = ["ALA", "CYS", "ASP", "GLU", "PHE",
          "GLY", "HIS", "ILE", "LYS", "LEU",
          "MET", "ASN", "PRO", "GLN", "ARG",
          "SER", "THR", "VAL", "TRP", "TYR"]

    pair_list = []
    for model in structure:

        dna_chain_l = []

        for chain in model:

            prot_comp = len([r for r in chain.get_residues() if r.resname.split()[0] in aa])
            dna_comp = len([r for r in chain.get_residues() if r.resname.split()[0] in nuc])

            if prot_comp:
                # protein
                pass

            if dna_comp:
                # nucleic
                dna_chain_l.append(chain)

        if len(dna_chain_l) == 1:
            log.warning('Only one DNA/RNA chain detected, is this correct?')

            chain_a = dna_chain_l[0]
            reslist_a = [r for r in chain_a.get_residues()]

            for ra, rb in itertools.combinations(reslist_a, 2):
                pair = identify_pairing(ra, rb)
                if pair:
                    pair_list.append(pair)

        if len(dna_chain_l) > 1:  # list sizes could be different, this might be improbable
            for chain_a, chain_b in itertools.combinations(dna_chain_l, 2):
                reslist_a = [r for r in chain_a.get_residues()]
                reslist_b = [r for r in chain_b.get_residues()]
                for ra in reslist_a:
                    for rb in reslist_b:
                        pair = identify_pairing(ra, rb)
                        pair_list.append(pair)
    return pair_list


def identify_pairing(ra, rb):
    """

    Args:
        ra:
        rb:

    Returns:

    """
    pair = []

    # check if the pairing is correct
    ra_name = ra.resname.split()[0]
    rb_name = rb.resname.split()[0]

    try:
        atom_pair_list = pairing[ra_name, rb_name]
    except KeyError:
        # pairing not possible
        return

    # check if distances are ok
    distance_l = []

    for atom_list in atom_pair_list:

        try:
            a = ra[atom_list[0]]
            b = rb[atom_list[1]]
            distance_l.append(a - b)
        except KeyError:
            # residue does not have the necessary sidechain atoms
            #  assume its not a pair
            return

    # check P-P distances to make sure its the opposite base
    # distances for perfect DNA:
    #  opposite = 18.8A
    #  sequential = 6.6A
    #
    # 10.0A should be sufficient
    p_cutoff = 10.0

    # Basedist_cutoff = 3.5
    basedist_cutoff = 3.5
    try:
        pa = ra.child_dict["P"]
        pb = rb.child_dict["P"]
        p_distance = pa - pb
    except KeyError:
        # some base is missing its P, use the geometric center instead
        cen_a = center_of_mass(ra.child_dict.values())
        cen_b = center_of_mass(rb.child_dict.values())
        p_distance = math.sqrt(sum([(a - b) ** 2 for a, b in zip(cen_a, cen_b)]))

    if p_distance > p_cutoff:
        resnum_a = ra.id[1]
        resnum_b = rb.id[1]

        if all(e < basedist_cutoff for e in distance_l):  # if ALL bonds are within range
            # KEEP IN MIND THAT this will not account for badly paired DNA
            # Implement a way to search for the closest possible pair? ##
            segid_a = ra.get_segid().split()[0]
            segid_b = rb.get_segid().split()[0]
            pair = (resnum_a, segid_a), (resnum_b, segid_b)

            # special atoms, mark them!
            for atom_pair in atom_pair_list:
                atom_a, atom_b = atom_pair

                ra[atom_a].bfactor = 1
                rb[atom_b].bfactor = 1
    return pair


def output_cg_restraints(pair_list):
    """

    Args:
        pair_list:

    Returns:

    """
    out = open("dna_restraints.def", "w")
    for i, e in enumerate(pair_list):
        idx = i + 1
        res_a = e[0][0]
        segid_a = e[0][1]
        res_b = e[1][0]
        segid_b = e[1][1]
        out.write(f"{{===>}} base_a_{idx}=(resid {res_a} and segid {segid_a});\n"
                  f"{{===>}} base_b_{idx}=(resid {res_b} and segid {segid_b});\n\n")
    out.close()


def extract_groups(pair_list):
    """

    Args:
        pair_list:

    Returns:

    """
    # this will be used to define AA restraints
    out = open("dna-aa_groups.dat", "w")
    # extract groups
    group_a = [a[0][0] for a in pair_list]
    segid_a = list(set([a[0][1] for a in pair_list]))

    group_b = [a[1][0] for a in pair_list]
    segid_b = list(set([a[0][1] for a in pair_list]))

    if len(segid_a) != 1:
        emsg = "Something is wrong with SEGID A"
        raise ModuleError(emsg)

    if len(segid_b) != 1:
        emsg = "Something is wrong with SEGID B"
        raise ModuleError(emsg)

    segid_a = segid_a[0]
    segid_b = segid_b[0]

    group_a.sort()
    group_b.sort()
    out.write(f"{group_a[0]}:{group_a[-1]}\n{segid_a}\n{group_b[0]}:{group_b[-1]}\n{segid_b}")
    out.close()


def create_file_with_cryst(pdb_file: str) -> None:
    """
    This function creates a new pdb because the CRYST line is missing from the pdf file.
    This line is necessary for DSSP.

    Args:
        output: str
        pdb_file: str

    Returns:
        pdb_file_copy: str

    """
    with open(pdb_file, "r") as file_in,\
    tempfile.NamedTemporaryFile(mode = "w", delete=False) as file_out:
        content = file_in.read()
        file_out.write(CRYST_LINE + content)

    return file_out.name


def determine_ss(structure: Structure, skipss: bool, pdbf_path: str) -> Structure:
    """Determine secondary structures from input structure

    Args:
        structure:
        skipss:
        pdbf_path:

    Returns:
        structure:

    """
    # calculate SS
    for model in structure:

        if skipss:
            continue
        try:
            tmp_file_name = create_file_with_cryst(pdbf_path)
            p = subprocess.Popen(["dssp", tmp_file_name, "--output-format", "dssp"],
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 text=True)
            dssp_raw, _ = p.communicate()
            dssp_raw = dssp_raw.split('#')[1].split('\n')[1:-1]
        except:  # TODO: think about making this exception more specific
            # no secondary structure detected for this model
            log.warning('SS could not be assigned, assigning code 1 to all residues')
            continue
        finally:
            if Path(tmp_file_name).exists(): Path(tmp_file_name).unlink()

        dssp = {}

        for line in dssp_raw:
            var_a, var_b = line[11], line[16]
            dssp.setdefault(var_a, []).append(var_b)

        # Get SS information and translate it:
        # DSSP > MARTINI > HADDOCK
        # this could still be improved
        for chain in model:
            dssp_ss = "".join(dssp[chain.id])
            _, martini_types = ss_classification(dssp_ss)  # ancestral function, keep it there

            # transform MARTINI > HADDOCK
            # and add it to the bfactor col
            for residue, ss in zip(chain, martini_types): # chain and martini_types order must match
                code = ss_to_code[ss]
                # for atom in residue.get_atoms():
                for atom in residue:
                    atom.bfactor = code
    return structure


def rename_nucbases(structure: Structure) -> None:
    """Inplace residue renaming according to HADDOCK ones.

    Parameters
    ----------
    structure : Bio.PDB.Structure.Structure
        Input structure
    """
    chainresdic = {
        c.get_id(): [r.get_resname() for r in c.get_residues()]
        for m in structure
        for c in m
    }

    nucleotide_list = ["CYT", "C", "DC", "THY", "T", "DT", "ADE",
                       "A", "DA", "G", "GUA", "DG", "U", "URI"]
    rna_resname_mapper = {"CYT": "C", "URI": "U", "ADE": "A", "GUA": "G"}
    dna_rename_mapper = {"CYT": "DC", "THY": "DT", "ADE": "DA", "GUA": "DG"}

    if [True for c in chainresdic for e in chainresdic[c] if e in nucleotide_list]:
        # Check if this is an RNA
        is_rna = [True for c in chainresdic for e in chainresdic[c] if e in ["U", "URI"]]
        ref_dic = rna_resname_mapper if is_rna else dna_rename_mapper
        # Loop over models
        for model in structure:
            for chain in model:
                for r in chain.get_residues():
                    if r.resname in ref_dic.keys():
                        # Rename residue name
                        r.resname = ref_dic[r.resname]


def martinize(
        input_pdb: str,
        output_path: str,
        skipss: bool,
    ) -> tuple[str, bool]:
    """
    Converts an all-atom (AA) PDB structure into a coarse-grained (CG) model
    using a MARTINI2.2 mapping and generating CG-to-AA restraints for backmapping.
    Optionally uses secondary structure for mapping.

    Args:
        input_pdb (str):
            Path to the input AA PDB file.
        output_path (str):
            Directory where the output files will be written.
                - A CG PDB file (*_cg.pdb)
                - A restraint table (*_cg_to_aa.tbl)
        skipss (bool):
            If True, skips secondary structure assignment (DSSP step).
            If False, assigns secondary structure and encodes it
            into HADDOCK-compatible B-factors.

    Returns:
        tuple[str, bool]:
            cg_pdb_name: Path to the generated CG PDB file.
            shape: True if at least one residue with name "SHA" (shape bead)
            is detected in the structure, False otherwise.
    """

    if not input_pdb:
        emsg = "No input file detected"
        raise ModuleError(emsg)

    p = PDBParser()
    io = PDBIO()

    # Parse PDB and run DSSP
    pdbf_path = os.path.realpath(input_pdb)
    aa_model = p.get_structure("aa_model", pdbf_path)

    # set ALL bfactors to 1
    for model in aa_model:
        for chain in model:
            if chain.id == " ":
                emsg = "Empty chain id detected"
                raise ModuleError(emsg)

            for residue in chain:
                for atom in residue:
                    atom.bfactor = 1.0

    # Assign HADDOCK code according to SS (1-9)
    determine_ss(structure=aa_model, skipss=skipss, pdbf_path=pdbf_path)

    # Strandardize naming
    # WARNING, THIS ASSUMES THAT INPUT DNA/RNA IS 3-LETTER CODE
    rename_nucbases(aa_model)

    # Assign HADDOCK code for hydrogen bonding capable nucleotides (0-1)
    pair_list = determine_hbonds(aa_model)
    if pair_list:
        output_cg_restraints(pair_list)

    # Map CG beads to AA structure
    structure_builder = StructureBuilder()
    structure_builder.init_structure("cg_model")
    structure_builder.init_seg(" ")  # Empty SEGID

    tbl_cg_to_aa = []
    restrain_counter = 0
    for model in aa_model:

        structure_builder.init_model(model.id)

        for chain in model:

            structure_builder.init_chain(chain.id)
            structure_builder.init_seg(chain.id)

            mapping_dic = map_cg(chain)

            for residue in mapping_dic:
                if residue.id[0] != " ":  # filter HETATMS
                    continue

                structure_builder.init_residue(residue.resname, residue.id[0],
                                               residue.id[1], residue.id[2])

                for i, bead in enumerate(mapping_dic[residue]):

                    bead_name = bead
                    bead_coord = mapping_dic[residue][bead_name][0]
                    haddock_code = mapping_dic[residue][bead_name][1]
                    restrain = mapping_dic[residue][bead_name][2]

                    structure_builder.init_atom(
                        bead_name,
                        bead_coord,
                        haddock_code,
                        1.00,
                        " ",
                        bead_name,
                        i)

                    tbl_cg_to_aa.append(restrain)
                    restrain_counter += 1

    cg_model = structure_builder.get_structure()

    # Write pre-CG structure
    io.set_structure(cg_model)
    # Setup in-memory text buffer
    io_file = StringIO()
    # Write file in it
    io.save(io_file, write_end=1)
    # Go back to the start of the file to read it
    io_file.seek(0)

    # Write the actual valid CG structure
    # make sure atom names are in the correct place
    # .BB. .BB1. .BB2. and not BB.. BB1.. BB2..
    cg_pdb_name = gen_cg_filename(f"../{output_path}", pdbf_path)
    out = open(cg_pdb_name, "w")
    for line in io_file.readlines():
        if line.startswith("ATOM"):
            atom_name = line[12:16].split()[0]
            # mind the spacing
            if 1 <= len(atom_name) <= 3:
                n_l = f"{line[:12]} {atom_name:<3s}{line[16:]}"
            else:
                n_l = line
        else:
            n_l = line
        out.write(n_l)
    out.close()
    del io_file

    # Write CG to AA backmapping restraint file
    tbl_file_name = gen_cg_tbl_backmapping_fname(f"../{output_path}", pdbf_path)
    with open(tbl_file_name, "w") as tbl_file:
        tbl_file.write("\n" + "\n".join([tbl for tbl in tbl_cg_to_aa if tbl]))

    return cg_pdb_name


def gen_cg_filename(
        output_dir: str,
        input_fname: str,
        force_field: Optional[str] = None,
        ext: Optional[str] = None,
        ) -> str:
    """Helper function to standarize CG filename from input file.

    Parameters
    ----------
    output_dir : str