Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions datamol/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,9 @@
"render_mol_df": "datamol.convert",
"to_inchi_non_standard": "datamol.convert",
"to_inchikey_non_standard": "datamol.convert",
"to_dict": "datamol.convert",
"from_dict": "datamol.convert",
"to_binary": "datamol.convert",
# fp
"to_fp": "datamol.fp",
"fp_to_array": "datamol.fp",
Expand Down Expand Up @@ -305,6 +308,9 @@ def __dir__():
from .convert import render_mol_df
from .convert import to_inchi_non_standard
from .convert import to_inchikey_non_standard
from .convert import to_dict
from .convert import from_dict
from .convert import to_binary

from .fp import to_fp
from .fp import fp_to_array
Expand Down
44 changes: 44 additions & 0 deletions datamol/convert.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,20 @@
from typing import Dict
from typing import Union
from typing import List
from typing import Optional
from typing import cast
from typing import Sequence

import re
import json

from loguru import logger

import pandas as pd

from rdkit import Chem
from rdkit.Chem import rdmolfiles
from rdkit.Chem import rdMolInterchange
from rdkit.Chem import PandasTools

import selfies as sf
Expand Down Expand Up @@ -361,6 +364,22 @@ def from_smarts(smarts: Optional[str]) -> Optional[Mol]:
return Chem.MolFromSmarts(smarts) # type: ignore


def to_binary(mol: Mol) -> Optional[bytes]:
    """Serialize a molecule with its `ToBinary()` method.

    Which molecular information ends up in the binary string depends on
    the RDKit pickling options.

    Args:
        mol: a molecule, or `None`.

    Returns:
        The result of `mol.ToBinary()`, or `None` when `mol` is `None`.
    """

    return None if mol is None else mol.ToBinary()  # type: ignore


def to_df(
mols: Sequence[Mol],
smiles_column: Optional[str] = "smiles",
Expand Down Expand Up @@ -514,6 +533,31 @@ def render_mol_df(df: pd.DataFrame):
_ChangeMoleculeRendering(df)


def to_dict(mols: Sequence[Mol]) -> Dict:
    """Convert a list of mols to a dict.

    The molecules are first serialized to JSON with
    `rdMolInterchange.MolsToJSON` and that JSON is then parsed into
    Python objects.

    For the reverse operation, you might want to check `dm.from_dict()`.

    Args:
        mols: a list of molecules.

    Returns:
        The parsed JSON document describing the molecules.
    """

    mols_json = rdMolInterchange.MolsToJSON(mols)
    return json.loads(mols_json)


def from_dict(mol_dict: Dict) -> List[Mol]:
    """Convert a dict to a list of mols.

    The dict is dumped to JSON and parsed with `rdMolInterchange.JSONToMols`.

    For the reverse operation, you might want to check `dm.to_dict()`.

    Args:
        mol_dict: a dict, as returned by `dm.to_dict()`.

    Returns:
        A list of molecules.
    """

    mols = rdMolInterchange.JSONToMols(json.dumps(mol_dict))

    # RDKit returns a tuple here; convert it so the result matches the
    # `List[Mol]` annotation.
    return list(mols)


def _ChangeMoleculeRendering(frame=None, renderer="PNG"):
"""Allows to change the rendering of the molecules between base64 PNG images and string
representations.
Expand Down
10 changes: 6 additions & 4 deletions datamol/mol.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def copy_mol(mol: Mol) -> Mol:


def to_mol(
mol: Union[str, Mol],
mol: Union[str, bytes, Mol],
add_hs: bool = False,
explicit_only: bool = False,
ordered: bool = False,
Expand All @@ -72,7 +72,7 @@ def to_mol(
"""Convert an input molecule (smiles representation) into a `Mol`.

Args:
mol: A SMILES or a molecule.
mol: A SMILES, a binary string from Mol.ToBinary(), or a molecule.
add_hs: Whether hydrogens should be added the molecule after the SMILES has been parsed.
explicit_only: Whether to only add explicit hydrogen or both
(implicit and explicit). when `add_hs` is set to True.
Expand All @@ -91,8 +91,8 @@ def to_mol(
None is returned so make sure that you handle this case on your own.
"""

if not isinstance(mol, (str, Mol)):
raise ValueError(f"Input should be a Mol or a string instead of '{type(mol)}'")
if not isinstance(mol, (str, bytes, Mol)):
raise ValueError(f"Input should be a Mol, a string, or bytes instead of '{type(mol)}'")

if isinstance(mol, str):
smiles_params = rdmolfiles.SmilesParserParams()
Expand All @@ -106,6 +106,8 @@ def to_mol(

if not sanitize and _mol is not None:
_mol.UpdatePropertyCache(False)
elif isinstance(mol, bytes):
_mol = Chem.Mol(mol)
else:
_mol = mol

Expand Down
51 changes: 51 additions & 0 deletions tests/test_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,29 @@ def test_inchi():
assert dm.from_inchi(None) is None


def test_to_binary(datadir):
    """Round-trip molecules through `dm.to_binary()` and `dm.to_mol()`."""

    # Molecule parsed from a SMILES.
    smiles_mol = dm.to_mol("CC(=O)Oc1ccccc1C(=O)O")

    smiles_mol_bytes = dm.to_binary(smiles_mol)
    assert isinstance(smiles_mol_bytes, bytes)
    assert dm.same_mol(smiles_mol, dm.to_mol(smiles_mol_bytes))

    # Molecule read from an SDF file: the conformer positions are compared as well.
    sdf_mol = dm.read_sdf(datadir / "TUBB3-observations.sdf")[0]

    sdf_mol_bytes = dm.to_binary(sdf_mol)
    assert isinstance(sdf_mol_bytes, bytes)

    restored = dm.to_mol(sdf_mol_bytes)
    assert dm.same_mol(sdf_mol, restored)

    delta = sdf_mol.GetConformer(0).GetPositions() - restored.GetConformer(0).GetPositions()
    assert np.sum(np.abs(delta)) < 1e-5


def test_to_df(datadir):
data_path = datadir / "TUBB3-observations.sdf"
mols = dm.read_sdf(data_path)
Expand Down Expand Up @@ -218,6 +241,34 @@ def test_to_df_smiles_warning(datadir, caplog):
)


def test_to_dict(datadir):
    """Check the layout of the dict built by `dm.to_dict()` from the SDF fixture."""

    mols = dm.read_sdf(datadir / "TUBB3-observations.sdf")
    molecules = dm.to_dict(mols)["molecules"]

    assert len(molecules) == 10
    for entry in molecules:
        assert "conformers" in entry
        assert "properties" in entry
        assert len(entry["properties"]) == 11


def test_from_dict(datadir):
    """Round-trip the SDF fixture through `dm.to_dict()` and `dm.from_dict()`."""

    data_path = datadir / "TUBB3-observations.sdf"
    mols = dm.read_sdf(data_path)
    mols_dict = dm.to_dict(mols)
    new_mols = dm.from_dict(mols_dict)

    # `zip` stops at the shortest input, so without this check an empty or
    # truncated result would make the loop below pass without comparing anything.
    assert len(new_mols) == len(mols)

    for mol, new_mol in zip(mols, new_mols):
        assert dm.same_mol(mol, new_mol)

        # The conformer coordinates must be preserved as well.
        delta = mol.GetConformer(0).GetPositions() - new_mol.GetConformer(0).GetPositions()
        assert np.sum(np.abs(delta)) < 1e-5


def test_to_cxsmiles():
mol = dm.to_mol("OC1=CC2CCCCC2[N:1]=C1")
smiles = dm.to_smiles(mol, cxsmiles=True)
Expand Down
4 changes: 4 additions & 0 deletions tests/test_mol.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ def test_to_mol():
mol = dm.to_mol(smiles)
assert mol is None

binary_string = b"\xef\xbe\xad\xde\x00\x00\x00\x00\x10\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00\x06\x00\x00\x00\x80\x01\x06@h\x00\x00\x00\x03\x03\x01\x06@h\x00\x00\x00\x03\x03\x01\x06@h\x00\x00\x00\x03\x03\x01\x06@h\x00\x00\x00\x03\x03\x01\x06@h\x00\x00\x00\x03\x03\x01\x06@h\x00\x00\x00\x03\x03\x01\x0b\x00\x01h\x0c\x01\x02h\x0c\x02\x03h\x0c\x03\x04h\x0c\x04\x05h\x0c\x05\x00h\x0cB\x01\x00\x00\x00\x06\x00\x05\x04\x03\x02\x01\x17\x04\x00\x00\x00\x00\x00\x00\x00\x16"
mol = dm.to_mol(binary_string)
assert mol.GetNumAtoms() == 6


def test_reorder_atoms():
smiles = "c1ccc(C(=O)O)c(c1)OC(=O)C"
Expand Down