Skip to content

Atom Typing

Provides utility functions for assigning atom types to each atom in a molecule. Atom types are defined in the SmartsPatternRegistry.

Warning

As of the current version of the library, these functions are not used in the main code. They are kept for comparison and testing with earlier versions of the library.

assign_atom_types

assign_atom_types(mol, atomgroup)

Assign atom types to each atom in the molecule. Atom types are defined in SmartsPatternRegistry.

Parameters:

Name Type Description Default
mol MolType

The molecule for which to assign atom types.

required
atomgroup AtomGroupType

The atomgroup for which to assign atom types.

required

Returns:

Type Description
NDArray[int8]

NDArray[np.int8]: An array of shape (n_atoms, n_atom_types) where each element is either 0 or 1.

Source code in lahuta/utils/atom_types.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
def assign_atom_types(mol: MolType, atomgroup: AtomGroupType) -> NDArray[np.int8]:
    """Build a binary atom-type membership matrix for a molecule.

    Typing happens in three passes:

    1. Every SMARTS pattern in `SmartsPatternRegistry` is matched against `mol`
       and the first atom of each match is flagged with the pattern's atom type.
    2. Non-hydrogen atoms of water residues are flagged as both hydrogen bond
       acceptors and hydrogen bond donors.
    3. For atoms in standard amino acid residues, every type listed in
       `PROT_ATOM_TYPES` is reset and then re-assigned from that dictionary,
       keyed by residue name + atom name.

    Args:
        mol (MolType): The molecule for which to assign atom types.
        atomgroup (AtomGroupType): The atomgroup for which to assign atom types.

    Returns:
        NDArray[np.int8]: An array of shape (n_atoms, n_atom_types) where each element is either 0 or 1.
    """
    type_columns = AVAILABLE_ATOM_TYPES
    atypes_array = np.zeros((mol.NumAtoms(), len(type_columns)), dtype=np.int8)

    # Pass 1: SMARTS-based typing.
    for atom_type in SmartsPatternRegistry:
        type_name = atom_type.name
        patterns = SmartsPatternRegistry[type_name].value
        for pattern in patterns.values():
            ob_smart: ObSmartPatternType = OBSmartsPatternWrapper(ob.OBSmartsPattern())
            ob_smart.Init(str(pattern))
            ob_smart.Match(mol)

            # Only the first atom of every match carries the atom type.
            for hit in ob_smart.GetMapList():
                matched_atom = mol.GetAtom(hit[0])
                atypes_array[matched_atom.GetId(), type_columns[type_name]] = 1

    # Pass 2: all water molecules are hydrogen bond donors and acceptors.
    water_heavy_atoms = atomgroup.select_atoms("resname SOL HOH TIP3 TIP4 WAT W and not name H*")
    for water_atom in water_heavy_atoms:
        for hbond_role in ("hbond_acceptor", "hbond_donor"):
            atypes_array[water_atom.index, type_columns[hbond_role]] = 1

    # Pass 3: override protein atom typing from the dictionary.
    protein_atoms = atomgroup.select_atoms("resname " + " ".join(STANDARD_AMINO_ACIDS))
    for residue in protein_atoms.residues:
        for atom in residue.atoms:
            row = atom.index

            # Remove types that may already have been assigned from SMARTS.
            for prot_atype in PROT_ATOM_TYPES:
                atypes_array[row, type_columns[prot_atype]] = 0

            # Add atom types from the dictionary; the lookup key is "<resname><atomname>".
            atom_id = residue.resname.strip() + atom.name.strip()
            for prot_atype, member_ids in PROT_ATOM_TYPES.items():
                if atom_id in member_ids:
                    atypes_array[row, type_columns[prot_atype]] = 1

    return atypes_array

vec_assign_atom_types

vec_assign_atom_types(mol, atomgroup, ta)

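Assign atom types to each atom in the molecule. Atom types are defined in SmartsPatternRegistry. Unlike assign_atom_types, SMARTS matches are applied only to atoms outside standard amino acid residues, and protein atoms are typed by looking up pre-extracted residue-name and atom-name arrays (ta).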

Parameters:

Name Type Description Default
mol MolType

The molecule for which to assign atom types.

required
atomgroup AtomGroupType

The atomgroup for which to assign atom types.

required
ta dict[str, NDArray[str_]]

A dictionary with the keys "name" and "resname", holding the atom names and the residue names as string arrays.

required

Returns:

Type Description
NDArray[int8]

NDArray[np.int8]: An array of shape (n_atoms, n_atom_types) where each element is either 0 or 1.

Source code in lahuta/utils/atom_types.py
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
def vec_assign_atom_types(
    mol: MolType,
    atomgroup: AtomGroupType,
    ta: dict[str, NDArray[np.str_]],
) -> NDArray[np.int8]:
    """Assign atom types to each atom in the molecule. Atom types are defined in `SmartsPatternRegistry`.

    SMARTS matches are only applied to atoms whose residue is not a standard
    amino acid. Atoms of standard amino acid residues are typed through
    `ID_TO_TYPES`, keyed by the stripped residue name concatenated with the
    stripped atom name.

    Args:
        mol (MolType): The molecule for which to assign atom types.
        atomgroup (AtomGroupType): The atomgroup for which to assign atom types.
        ta (dict[str, NDArray[np.str_]]): A dictionary containing the atom names and residue names.
            `ta["resname"]` is indexed with residue indices and `ta["name"]` with atom indices.

    Returns:
        NDArray[np.int8]: An array of shape (n_atoms, n_atom_types) where each element is either 0 or 1.

    """
    # NOTE(review): columns follow the key order of PROT_ATOM_TYPES here, while
    # assign_atom_types uses AVAILABLE_ATOM_TYPES - confirm both layouts agree.
    atypes = {x: i for i, x in enumerate(PROT_ATOM_TYPES)}

    atypes_array = np.zeros((mol.NumAtoms(), len(atypes)), dtype=np.int8)
    for atom_type in SmartsPatternRegistry:
        smartsdict = SmartsPatternRegistry[atom_type.name].value
        for smarts in smartsdict.values():
            ob_smart: ObSmartPatternType = OBSmartsPatternWrapper(ob.OBSmartsPattern())
            ob_smart.Init(str(smarts))
            ob_smart.Match(mol)

            # Only the first atom of every match carries the atom type.
            matches = [x[0] for x in ob_smart.GetMapList()]
            for match in matches:
                atom = mol.GetAtom(match)

                # Protein atoms are typed from ID_TO_TYPES below, not from SMARTS.
                if atom.GetResidue().GetName() not in STANDARD_AMINO_ACIDS:
                    atypes_array[atom.GetId(), atypes[atom_type.name]] = 1

    # ALL WATER MOLECULES ARE HYDROGEN BOND DONORS AND ACCEPTORS
    for atom in atomgroup.select_atoms("resname SOL HOH TIP3 TIP4 WAT W and not name H*"):
        atypes_array[atom.index, atypes["hbond_acceptor"]] = 1
        atypes_array[atom.index, atypes["hbond_donor"]] = 1

    # OVERRIDE PROTEIN ATOM TYPING FROM DICTIONARY
    resname, atom_name = ta["resname"], ta["name"]

    # Collect the indices of the *protein selection* so that the identifier
    # array built below lines up one-to-one with the atoms in `ag`. Building
    # them from the full atomgroup would misalign identifiers as soon as the
    # atomgroup contains non-protein atoms. The explicit integer dtype keeps
    # the arrays usable as indices when the selection is empty.
    ag = atomgroup.select_atoms("resname " + " ".join(STANDARD_AMINO_ACIDS)).atoms
    resindices = np.array([atom.resindex for atom in ag], dtype=np.intp)
    indices = np.array([atom.index for atom in ag], dtype=np.intp)

    # Convert arrays to string type
    resname_str = resname[resindices].astype(str)
    atom_name_str = atom_name[indices].astype(str)

    # Generate atom_id array by concatenating resname and atom_name arrays.
    # np.char is the public home of these routines (np.core is deprecated).
    atom_ids: NDArray[np.str_] = np.char.add(
        np.char.strip(resname_str),
        np.char.strip(atom_name_str),
    )

    for atom, atom_id in zip(ag, atom_ids):
        atom_types = ID_TO_TYPES.get(atom_id, None)

        # Atoms without a dictionary entry keep all-zero protein types.
        if atom_types is None:
            continue

        for atom_type_x in atom_types:
            atypes_array[atom.index, atypes[atom_type_x]] = 1

    return atypes_array