Skip to content

Topology.from_pdb silently mangles input if atom order in PDB splits up molecule #2093

@j-wags

Description

@j-wags

Describe the bug

If the records in a PDB file appear in the following order:

(protein1)
(protein2)
(protein3)
CONECT protein1 protein3

then something goes wrong with the atom indexing, and the loaded topology is mangled (that is, the bonds, elements, and coordinates are wrong).

This should raise an error rather than returning a mangled topology.

To Reproduce

min_repro2.pdb.tar.gz

from openff.toolkit import Topology
import numpy as np
from openff.units import unit

# Load the reproduction PDB into a Topology; presumably this is the file
# extracted from the attached min_repro2.pdb.tar.gz (confirm before running).
bad_top = Topology.from_pdb('min_repro2.pdb')


def check_violations(topology):
    """Count bonds whose length lies outside the open interval (0.4, 2.5) angstrom.

    For every bond of every molecule in ``topology``, the distance between
    the two bonded atoms is measured in the molecule's first conformer.
    Each bond that fails the length check is reported on stdout: the length
    in angstrom, then the index, atom and metadata of each bonded atom,
    followed by a blank line.

    Returns the number of bonds that failed the check.
    """
    bad_bond_count = 0
    for molecule in topology.molecules:
        for bond in molecule.bonds:
            # Only the first conformer is inspected.
            coords = molecule.conformers[0]
            separation = coords[bond.atom1_index] - coords[bond.atom2_index]
            length_angstrom = np.linalg.norm(separation).m_as(unit.angstrom)

            # Bond length is inside the accepted window: nothing to report.
            if 0.4 < length_angstrom < 2.5:
                continue

            print(length_angstrom)
            endpoints = (
                (bond.atom1_index, bond.atom1),
                (bond.atom2_index, bond.atom2),
            )
            for atom_index, atom in endpoints:
                print(atom_index, atom)
                print(atom.metadata)
            print()
            bad_bond_count += 1
    return bad_bond_count

# Run the bond-length check on the loaded topology and print the total
# number of out-of-range bonds.
n_violations = check_violations(bad_top)
print(f"Found {n_violations} violations")

This outputs:

2.924166376935485
34 <Atom name='OG1' atomic number='1'>
{'residue_name': 'THR', 'residue_number': '2', 'insertion_code': ' ', 'chain_id': 'A', 'match_info': '{"1": ["CYS", 11], "15": ["PEPTIDE_BOND", 4]}'}
28 <Atom name='N' atomic number='7'>
{'residue_name': 'THR', 'residue_number': '2', 'insertion_code': ' ', 'chain_id': 'A', 'match_info': '{"1": ["CYS", 0], "3": ["CYS", 0], "15": ["PEPTIDE_BOND", 3]}'}

4.447098604708468
30 <Atom name='C' atomic number='6'>
{'residue_name': 'THR', 'residue_number': '2', 'insertion_code': ' ', 'chain_id': 'A', 'match_info': '{"1": ["CYS", 2], "3": ["CYS", 2], "16": ["PEPTIDE_BOND", 1]}'}
38 <Atom name='HG1' atomic number='7'>
{'residue_name': 'THR', 'residue_number': '2', 'insertion_code': ' ', 'chain_id': 'A', 'match_info': '{"10": ["NME", 1], "16": ["PEPTIDE_BOND", 3]}'}

4.489425798473564
39 <Atom name='HG21' atomic number='6'>
{'residue_name': 'THR', 'residue_number': '2', 'insertion_code': ' ', 'chain_id': 'A', 'match_info': '{"10": ["NME", 2], "16": ["PEPTIDE_BOND", 5]}'}
42 <Atom name='N' atomic number='1'>
{'residue_name': 'NME', 'residue_number': '3', 'insertion_code': ' ', 'chain_id': 'A', 'match_info': '{"10": ["NME", 4]}'}

5.479940145658528
39 <Atom name='HG21' atomic number='6'>
{'residue_name': 'THR', 'residue_number': '2', 'insertion_code': ' ', 'chain_id': 'A', 'match_info': '{"10": ["NME", 2], "16": ["PEPTIDE_BOND", 5]}'}
43 <Atom name='C' atomic number='1'>
{'residue_name': 'NME', 'residue_number': '3', 'insertion_code': ' ', 'chain_id': 'A', 'match_info': '{"10": ["NME", 5]}'}

3.4491478657778663
40 <Atom name='HG22' atomic number='1'>
{'residue_name': 'THR', 'residue_number': '2', 'insertion_code': ' ', 'chain_id': 'A', 'match_info': '{"10": ["NME", 6], "16": ["PEPTIDE_BOND", 4]}'}
38 <Atom name='HG1' atomic number='7'>
{'residue_name': 'THR', 'residue_number': '2', 'insertion_code': ' ', 'chain_id': 'A', 'match_info': '{"10": ["NME", 1], "16": ["PEPTIDE_BOND", 3]}'}

6.219911976869127
33 <Atom name='CG2' atomic number='16'>
{'residue_name': 'THR', 'residue_number': '2', 'insertion_code': ' ', 'chain_id': 'A', 'match_info': '{"1": ["CYS", 6], "3": ["CYS", 6], "17": ["DISULFIDE", 2]}'}
11 <Atom name='SG' atomic number='16'>
{'residue_name': 'CYS', 'residue_number': '2', 'insertion_code': ' ', 'chain_id': ' ', 'match_info': '{"0": ["CYS", 6], "2": ["CYS", 6], "17": ["DISULFIDE", 1]}'}

2.9372880349056665
0 <Atom name='H' atomic number='6'>
{'residue_name': 'NME', 'residue_number': '3', 'insertion_code': ' ', 'chain_id': 'A', 'match_info': '{"6": ["ACE", 1], "13": ["PEPTIDE_BOND", 1]}'}
2 <Atom name='H2' atomic number='6'>
{'residue_name': 'NME', 'residue_number': '3', 'insertion_code': ' ', 'chain_id': 'A', 'match_info': '{"6": ["ACE", 3]}'}

2.7258301487803656
0 <Atom name='H' atomic number='6'>
{'residue_name': 'NME', 'residue_number': '3', 'insertion_code': ' ', 'chain_id': 'A', 'match_info': '{"6": ["ACE", 1], "13": ["PEPTIDE_BOND", 1]}'}
1 <Atom name='H1' atomic number='8'>
{'residue_name': 'NME', 'residue_number': '3', 'insertion_code': ' ', 'chain_id': 'A', 'match_info': '{"6": ["ACE", 2], "13": ["PEPTIDE_BOND", 2]}'}

9.997196507021354
2 <Atom name='H2' atomic number='6'>
{'residue_name': 'NME', 'residue_number': '3', 'insertion_code': ' ', 'chain_id': 'A', 'match_info': '{"6": ["ACE", 3]}'}
4 <Atom name='C' atomic number='1'>
{'residue_name': 'ACE', 'residue_number': '1', 'insertion_code': ' ', 'chain_id': 'B', 'match_info': '{"6": ["ACE", 5]}'}

9.552955825293028
2 <Atom name='H2' atomic number='6'>
{'residue_name': 'NME', 'residue_number': '3', 'insertion_code': ' ', 'chain_id': 'A', 'match_info': '{"6": ["ACE", 3]}'}
5 <Atom name='O' atomic number='1'>
{'residue_name': 'ACE', 'residue_number': '1', 'insertion_code': ' ', 'chain_id': 'B', 'match_info': '{"6": ["ACE", 6]}'}

11.644704934003268
0 <Atom name='H' atomic number='6'>
{'residue_name': 'NME', 'residue_number': '3', 'insertion_code': ' ', 'chain_id': 'A', 'match_info': '{"6": ["ACE", 1], "13": ["PEPTIDE_BOND", 1]}'}
6 <Atom name='CH3' atomic number='7'>
{'residue_name': 'ACE', 'residue_number': '1', 'insertion_code': ' ', 'chain_id': 'B', 'match_info': '{"4": ["THR", 0], "13": ["PEPTIDE_BOND", 3]}'}

3.1266034286426523
7 <Atom name='H1' atomic number='6'>
{'residue_name': 'ACE', 'residue_number': '1', 'insertion_code': ' ', 'chain_id': 'B', 'match_info': '{"4": ["THR", 1], "13": ["PEPTIDE_BOND", 5]}'}
10 <Atom name='N' atomic number='6'>
{'residue_name': 'CYS', 'residue_number': '2', 'insertion_code': ' ', 'chain_id': 'B', 'match_info': '{"4": ["THR", 5]}'}

5.348462302381874
7 <Atom name='H1' atomic number='6'>
{'residue_name': 'ACE', 'residue_number': '1', 'insertion_code': ' ', 'chain_id': 'B', 'match_info': '{"4": ["THR", 1], "13": ["PEPTIDE_BOND", 5]}'}
14 <Atom name='CB' atomic number='1'>
{'residue_name': 'CYS', 'residue_number': '2', 'insertion_code': ' ', 'chain_id': 'B', 'match_info': '{"4": ["THR", 13]}'}

4.0524119978106885
10 <Atom name='N' atomic number='6'>
{'residue_name': 'CYS', 'residue_number': '2', 'insertion_code': ' ', 'chain_id': 'B', 'match_info': '{"4": ["THR", 5]}'}
15 <Atom name='SG' atomic number='1'>
{'residue_name': 'CYS', 'residue_number': '2', 'insertion_code': ' ', 'chain_id': 'B', 'match_info': '{"4": ["THR", 12]}'}

5.069885501665694
13 <Atom name='O' atomic number='1'>
{'residue_name': 'CYS', 'residue_number': '2', 'insertion_code': ' ', 'chain_id': 'B', 'match_info': '{"4": ["THR", 14], "13": ["PEPTIDE_BOND", 4]}'}
6 <Atom name='CH3' atomic number='7'>
{'residue_name': 'ACE', 'residue_number': '1', 'insertion_code': ' ', 'chain_id': 'B', 'match_info': '{"4": ["THR", 0], "13": ["PEPTIDE_BOND", 3]}'}

2.68545415153564
16 <Atom name='H' atomic number='1'>
{'residue_name': 'CYS', 'residue_number': '2', 'insertion_code': ' ', 'chain_id': 'B', 'match_info': '{"4": ["THR", 7]}'}
12 <Atom name='C' atomic number='8'>
{'residue_name': 'CYS', 'residue_number': '2', 'insertion_code': ' ', 'chain_id': 'B', 'match_info': '{"4": ["THR", 6]}'}

5.820392512537275
8 <Atom name='H2' atomic number='6'>
{'residue_name': 'ACE', 'residue_number': '1', 'insertion_code': ' ', 'chain_id': 'B', 'match_info': '{"4": ["THR", 2], "14": ["PEPTIDE_BOND", 1]}'}
20 <Atom name='N' atomic number='7'>
{'residue_name': 'NME', 'residue_number': '3', 'insertion_code': ' ', 'chain_id': 'B', 'match_info': '{"9": ["NME", 1], "14": ["PEPTIDE_BOND", 3]}'}

Found 17 violations

Metadata

Metadata

Assignees

Labels

No labels
No labels

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions