Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(fmt/pdb): don't guess in reader #437

Merged
merged 9 commits into from
Dec 18, 2024
13 changes: 10 additions & 3 deletions include/nuri/algo/guess.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
#ifndef NURI_ALGO_GUESS_H_
#define NURI_ALGO_GUESS_H_

#include <absl/base/attributes.h>

#include "nuri/core/molecule.h"

namespace nuri {
Expand All @@ -26,8 +28,9 @@ constexpr double kDefaultThreshold = 0.5;
* If connectivity information is already present and is correct, consider using
* guess_all_types().
*/
extern bool guess_everything(MoleculeMutator &mut, int conf = 0,
double threshold = kDefaultThreshold);
ABSL_MUST_USE_RESULT extern bool
guess_everything(MoleculeMutator &mut, int conf = 0,
double threshold = kDefaultThreshold);

/**
* @brief Guess connectivity information of a molecule.
Expand All @@ -53,7 +56,7 @@ extern void guess_connectivity(MoleculeMutator &mut, int conf = 0,
* and all atom/bond types and implicit hydrogen counts are incorrect. The
* information present in the molecule could be overwritten by this function.
*/
extern bool guess_all_types(Molecule &mol, int conf = 0);
ABSL_MUST_USE_RESULT extern bool guess_all_types(Molecule &mol, int conf = 0);

/**
* @brief Guess formal charges of a molecule.
Expand Down Expand Up @@ -84,6 +87,10 @@ extern void guess_hydrogens_2d(Molecule &mol);
* guess_hydrogens() in sequence.
*/
extern void guess_fcharge_hydrogens_2d(Molecule &mol);

namespace internal {
ABSL_MUST_USE_RESULT extern bool guess_update_subs(Molecule &mol);
} // namespace internal
} // namespace nuri

#endif /* NURI_ALGO_GUESS_H_ */
2 changes: 1 addition & 1 deletion include/nuri/fmt/base.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ class MoleculeReader {
*/
virtual Molecule parse(const std::vector<std::string> &block) const = 0;

virtual bool sanitized() const = 0;
virtual bool bond_valid() const = 0;

MoleculeStream<MoleculeReader> stream() { return { *this }; }
};
Expand Down
2 changes: 1 addition & 1 deletion include/nuri/fmt/mol2.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class Mol2Reader: public DefaultReaderImpl<read_mol2> {

bool getnext(std::vector<std::string> &block) override;

bool sanitized() const override { return false; }
bool bond_valid() const override { return true; }

private:
bool read_mol_header_ = false;
Expand Down
2 changes: 1 addition & 1 deletion include/nuri/fmt/pdb.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class PDBReader: public DefaultReaderImpl<read_pdb> {

bool getnext(std::vector<std::string> &block) override;

bool sanitized() const override { return true; }
bool bond_valid() const override { return false; }

private:
std::vector<std::string> header_;
Expand Down
2 changes: 1 addition & 1 deletion include/nuri/fmt/sdf.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class SDFReader: public DefaultReaderImpl<read_sdf> {

bool getnext(std::vector<std::string> &block) override;

bool sanitized() const override { return false; }
bool bond_valid() const override { return true; }
};

class SDFReaderFactory: public DefaultReaderFactoryImpl<SDFReader> {
Expand Down
2 changes: 1 addition & 1 deletion include/nuri/fmt/smiles.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ class SmilesReader: public DefaultReaderImpl<read_smiles> {

bool getnext(std::vector<std::string> &block) override;

bool sanitized() const override { return false; }
bool bond_valid() const override { return true; }
};

class SmilesReaderFactory: public DefaultReaderFactoryImpl<SmilesReader> {
Expand Down
9 changes: 5 additions & 4 deletions python/docs/conf.py.in
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

import doctest
import re

import doctest
from sphinx.ext import autodoc

# -- Project information -----------------------------------------------------
Expand Down Expand Up @@ -61,10 +61,10 @@ html_static_path = ["@CMAKE_CURRENT_LIST_DIR@/_static"]
if "@DOXYGEN_OUTPUT_DIR@":
html_extra_path = ["@DOXYGEN_OUTPUT_DIR@/html"]
html_css_files = [
"css/rtd-property.css" # workaround https://github.com/readthedocs/sphinx_rtd_theme/issues/1301
"css/rtd-property.css" # workaround https://github.com/readthedocs/sphinx_rtd_theme/issues/1301
]
html_theme_options = {
'navigation_depth': -1,
"navigation_depth": -1,
}

intersphinx_mapping = {
Expand All @@ -83,7 +83,8 @@ doctest_global_setup = "import nuri"
class UnneededBaseStripDocumenter(autodoc.ClassDocumenter):
_strip_re = re.compile(
r"\s*Bases:\s*:py:class:"
r"`([^`]*?\bpybind11_[A-Za-z0-9_\.]+|object)`\s*")
r"`([^`]*?\bpybind11_[A-Za-z0-9_\.]+|object)`\s*"
)

def add_line(self, line: str, source: str, *lineno: int):
if self._strip_re.fullmatch(line):
Expand Down
32 changes: 23 additions & 9 deletions python/src/nuri/fmt/fmt_module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <pybind11/pytypes.h>
#include <pybind11/stl/filesystem.h>

#include "nuri/algo/guess.h"
#include "nuri/core/molecule.h"
#include "nuri/fmt/base.h"
#include "nuri/fmt/mol2.h"
Expand All @@ -38,7 +39,8 @@
public:
PyMoleculeReader(std::unique_ptr<std::istream> is, std::string_view fmt,
bool sanitize, bool skip_on_error)
: stream_(std::move(is)), skip_on_error_(skip_on_error) {
: stream_(std::move(is)), sanitize_(sanitize),
skip_on_error_(skip_on_error) {
if (!*stream_)
throw py::value_error(absl::StrCat("Invalid stream object"));

Expand All @@ -51,7 +53,7 @@
if (!reader_)
throw py::value_error(absl::StrCat("Failed to create reader for ", fmt));

sanitize_ = sanitize && !reader_->sanitized();
guess_ = sanitize && !reader_->bond_valid();
}

auto next() {
Expand All @@ -78,7 +80,16 @@
continue;
}

if (sanitize_ && !MoleculeSanitizer(mol).sanitize_all()) {
if (guess_ && mol.is_3d()) {
if (!internal::guess_update_subs(mol)) {
log_or_throw("Failed to guess molecule atom/bond types");
continue;

Check warning on line 86 in python/src/nuri/fmt/fmt_module.cpp

View check run for this annotation

Codecov / codecov/patch

python/src/nuri/fmt/fmt_module.cpp#L85-L86

Added lines #L85 - L86 were not covered by tests
}
} else if (sanitize_ && !MoleculeSanitizer(mol).sanitize_all()) {
ABSL_LOG_IF(WARNING, guess_ && !mol.is_3d())
<< "Reader might produce molecules with invalid bonds, but the "
"molecule is missing 3D coordinates; guessing is disabled.";

Check warning on line 91 in python/src/nuri/fmt/fmt_module.cpp

View check run for this annotation

Codecov / codecov/patch

python/src/nuri/fmt/fmt_module.cpp#L91

Added line #L91 was not covered by tests

log_or_throw("Failed to sanitize molecule");
continue;
}
Expand All @@ -102,6 +113,7 @@
std::vector<std::string> block_;
bool sanitize_;
bool skip_on_error_;
bool guess_;
};

template <class F, class... Args>
Expand Down Expand Up @@ -146,9 +158,10 @@

:param fmt: The format of the file.
:param path: The path to the file.
:param sanitize: Whether to sanitize the produced molecule. Note that if the
underlying reader produces a sanitized molecule, this option is ignored and
the molecule is always sanitized.
:param sanitize: Whether to sanitize the produced molecule. For formats that is
known to produce molecules with insufficient bond information (e.g. PDB), this
option will trigger guessing based on the 3D coordinates
(:func:`nuri.algo.guess_everything()`).
:param skip_on_error: Whether to skip a molecule if an error occurs, instead of
raising an exception.
:raises OSError: If any file-related error occurs.
Expand All @@ -171,9 +184,10 @@

:param fmt: The format of the file.
:param data: The string to read.
:param sanitize: Whether to sanitize the produced molecule. Note that if the
underlying reader produces a sanitized molecule, this option is ignored and
the molecule is always sanitized.
:param sanitize: Whether to sanitize the produced molecule. For formats that is
known to produce molecules with insufficient bond information (e.g. PDB), this
option will trigger guessing based on the 3D coordinates
(:func:`nuri.algo.guess_everything()`).
:param skip_on_error: Whether to skip a molecule if an error occurs, instead of
raising an exception.
:raises ValueError: If the format is unknown or sanitization fails, unless
Expand Down
15 changes: 15 additions & 0 deletions src/algo/guess/3d.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2170,4 +2170,19 @@ bool guess_all_types(Molecule &mol, int conf) {
reset_bonds(mol);
return guess_types_common(mol, mol.confs()[conf]);
}

namespace internal {
bool guess_update_subs(Molecule &mol) {
bool ret;
{
auto mut = mol.mutator();
ret = guess_everything(mut);
}

for (auto &sub: mol.substructures())
sub.refresh_bonds();

return ret;
}
} // namespace internal
} // namespace nuri
Loading
Loading