Skip to content

Commit e735c2b

Browse files
Speed up identical molecule detection (#2025)
* speed up molecule comparison (should help with #2008) * update docstring and releasehistory * remove trailing whitespace * Apply some suggestions from code review * Apply suggestions from code review Co-authored-by: Josh A. Mitchell <[email protected]> * fix whitespace --------- Co-authored-by: Josh A. Mitchell <[email protected]>
1 parent f4fff54 commit e735c2b

File tree

2 files changed

+34
-22
lines changed

2 files changed

+34
-22
lines changed

docs/releasehistory.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ Releases follow the `major.minor.micro` scheme recommended by [PEP440](https://w
1313
### Behavior changes
1414

1515
- [PR #2026](https://github.com/openforcefield/openff-toolkit/pull/2026): Makes `Molecule.__repr__` more succinct for large molecules.
16+
- [PR #2025](https://github.com/openforcefield/openff-toolkit/pull/2025): Speeds up `Molecule.ordered_connection_table_hash`, but changes the specific hash outputted for a given molecule. The meaning of hash identity within a single OpenFF Toolkit version is unchanged. Updates documentation to state that these this method is only intended for comparing Molecule objects using the same version of the OpenFF Toolkit, and that hashes may not be stable between versions.
1617
- [PR #2041](https://github.com/openforcefield/openff-toolkit/pull/2041): Drop testing on Python 3.10
1718

1819
### Bugfixes

openff/toolkit/topology/molecule.py

Lines changed: 33 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
2626
"""
2727

28+
import hashlib
2829
import json
2930
import operator
3031
import pathlib
@@ -750,11 +751,11 @@ def atom2(self):
750751

751752
@property
752753
def atom1_index(self) -> int:
753-
return self.molecule.atoms.index(self._atom1)
754+
return self._atom1.molecule_atom_index
754755

755756
@property
756757
def atom2_index(self) -> int:
757-
return self.molecule.atoms.index(self._atom2)
758+
return self._atom2.molecule_atom_index
758759

759760
@property
760761
def atoms(self):
@@ -1255,17 +1256,28 @@ def __hash__(self):
12551256
return hash(self.to_smiles())
12561257

12571258
def ordered_connection_table_hash(self) -> int:
1258-
"""Compute an ordered hash of the atoms and bonds in the molecule"""
1259+
"""
1260+
Compute an ordered hash of the atoms and bonds in the molecule.
1261+
1262+
This hash method is intended for comparison of Molecule objects at
1263+
runtime, and hashes from one version of the software should not be
1264+
compared with hashes generated using different versions.
1265+
"""
12591266
if self._ordered_connection_table_hash is not None:
12601267
return self._ordered_connection_table_hash
12611268

1269+
# Pre-assign molecule atom indices in O(N) time to avoid use of List.index to get index of each one
1270+
# in O(N^2) time
1271+
for index, atom in enumerate(self.atoms):
1272+
atom._molecule_atom_index = index
1273+
12621274
id = ""
12631275
for atom in self.atoms:
1264-
id += f"{atom.symbol}_{atom.formal_charge}_{atom.stereochemistry}__"
1276+
id += f"{atom.atomic_number}_{atom.formal_charge.magnitude}_{atom.stereochemistry}__"
12651277
for bond in self.bonds:
12661278
id += f"{bond.bond_order}_{bond.stereochemistry}_{bond.atom1_index}_{bond.atom2_index}__"
12671279

1268-
self._ordered_connection_table_hash = hash(id)
1280+
self._ordered_connection_table_hash = hashlib.sha3_224(id.encode("utf-8")).hexdigest()
12691281
return self._ordered_connection_table_hash
12701282

12711283
@classmethod
@@ -1971,23 +1983,22 @@ def from_smiles(
19711983
return molecule
19721984

19731985
def _is_exactly_the_same_as(self, other):
1974-
for atom1, atom2 in zip(self.atoms, other.atoms):
1975-
if (
1976-
(atom1.atomic_number != atom2.atomic_number)
1977-
or (atom1.formal_charge != atom2.formal_charge)
1978-
or (atom1.is_aromatic != atom2.is_aromatic)
1979-
or (atom1.stereochemistry != atom2.stereochemistry)
1980-
):
1981-
return False
1982-
for bond1, bond2 in zip(self.bonds, other.bonds):
1983-
if (
1984-
(bond1.atom1_index != bond2.atom1_index)
1985-
or (bond1.atom2_index != bond2.atom2_index)
1986-
or (bond1.is_aromatic != bond2.is_aromatic)
1987-
or (bond1.stereochemistry != bond2.stereochemistry)
1988-
):
1989-
return False
1990-
return True
1986+
# Pre-assign molecule atom indices in O(N) time to avoid use of List.index to get index of each one
1987+
# in O(N^2) time
1988+
for index, atom in enumerate(self.atoms):
1989+
atom._molecule_atom_index = index
1990+
for index, atom in enumerate(other.atoms):
1991+
atom._molecule_atom_index = index
1992+
1993+
self_id = (
1994+
tuple((atom.atomic_number, atom.formal_charge.magnitude, atom.stereochemistry) for atom in self.atoms),
1995+
tuple((bond.bond_order, bond.stereochemistry, bond.atom1_index, bond.atom2_index) for bond in self.bonds),
1996+
)
1997+
other_id = (
1998+
tuple((atom.atomic_number, atom.formal_charge.magnitude, atom.stereochemistry) for atom in other.atoms),
1999+
tuple((bond.bond_order, bond.stereochemistry, bond.atom1_index, bond.atom2_index) for bond in other.bonds),
2000+
)
2001+
return self_id == other_id
19912002

19922003
@staticmethod
19932004
def are_isomorphic(

0 commit comments

Comments
 (0)