Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix diversity #138

Merged
merged 18 commits into from
Sep 16, 2023
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 68 additions & 58 deletions DiverseSelector/diversity.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@

def compute_diversity(
features: np.array,
div_type: str = "total_diversity_volume",
div_type: str = "entropy",
library: np.array = None,
FarnazH marked this conversation as resolved.
Show resolved Hide resolved
) -> float:
"""Compute diversity metrics.

Expand All @@ -53,11 +54,11 @@ def compute_diversity(
div_type : str, optional
Method of calculation diversity for a given molecule set, which
includes "entropy", "logdet", "shannon_entropy", "wdud",
gini_coefficient" and "total_diversity_volume". Default is "total_diversity_volume".
mols : List[rdkit.Chem.rdchem.Mol], optional
List of RDKit molecule objects. This is only needed when using the
"explicit_diversity_index" method. Default=None.

gini_coefficient" and "hypersphere_overlap_of_subset".
Default is "entropy".
library : np.ndarray, optional
Feature matrix of entire molecule library, used only if
calculating hypersphere_overlap_of_subset. Default is "None".
Returns
-------
float, computed diversity.
Expand All @@ -68,12 +69,13 @@ def compute_diversity(
"logdet": logdet,
"shannon_entropy": shannon_entropy,
"wdud": wdud,
"total_diversity_volume": total_diversity_volume,
"gini_coefficient": gini_coefficient,
}

if div_type in func_dict:
return func_dict[div_type](features)
elif div_type == "hypersphere_overlap_of_subset":
return hypersphere_overlap_of_subset(library, features)
FarnazH marked this conversation as resolved.
Show resolved Hide resolved
else:
raise ValueError(f"Diversity type {div_type} not supported.")

Expand All @@ -83,11 +85,13 @@ def entropy(x: np.ndarray) -> float:

The equation for entropy is
.. math::
E = $-\frac{\sum{\frac{y_i}{N}\ln{\frac{y_i}{N}}}}{L\frac{\ln{2}}{2}}$
E = -\frac{\sum{\frac{y_i}{N}\ln{\frac{y_i}{N}}}}{L\frac{\ln{2}}{2}}

where N is the number of molecules in the set, L is the length of the fingerprint,
and :math:y_i is a vector of the bitcounts of each feature in the fingerprints.
FanwangM marked this conversation as resolved.
Show resolved Hide resolved

Higher values mean more diversity.

Parameters
----------
x : ndarray
Expand All @@ -96,53 +100,46 @@ def entropy(x: np.ndarray) -> float:
Returns
-------
e : float
Entropy of matrix.
Entropy of matrix in the range [0,1].

Notes
-----
Feature matrices are converted to bits,
so we lose any information associated with num in matrix.
The feature matrix does not need to be in bitstring form to use this function.
However, the features should be encoded in a binary way, such that 0 means
absence of the feature, and nonzero means presence of the feature.

Weidlich, I. E., and Filippov, I. V. (2016)
Using the Gini coefficient to measure the chemical diversity of small-molecule libraries.
Journal of Computational Chemistry 37, 2091-2097.
FanwangM marked this conversation as resolved.
Show resolved Hide resolved
"""

# convert input matrix to bit matrix
y = np.empty(x.shape)
for i in range(0, len(x)):
for j in range(0, len(x[0])):
if x[i, j] != 0:
y[i, j] = 1
else:
y[i, j] = 0
# initialize variables
length = len(x[0])
n = len(x)
FarnazH marked this conversation as resolved.
Show resolved Hide resolved
top = 0
val = []
# count bits in fingerprint
for i in range(0, length):
val.append(sum(y[:, i]))
ans = np.sort(val)
# count bits in fingerprint and order them
counts = np.count_nonzero(x, axis=0)
counts = np.sort(counts)
# sum entropy calculation for each feature
if 0 in counts:
raise ValueError
FarnazH marked this conversation as resolved.
Show resolved Hide resolved
for i in range(0, length):
if ans[i] == 0:
raise ValueError
if ans[i] != 0:
top += ((ans[i]) / n) * (np.log(ans[i] / n))
top += ((counts[i]) / n) * (np.log(counts[i] / n))
e = -1 * (top / (length * 0.34657359027997264))
return e


def logdet(x: np.ndarray) -> float:
r"""Computes the log determinant function.

Input is an :math:S\times :math:n feature matrix with
:math:S molecules and :math:n features.
Input is an :math:`S\times n` feature matrix with
:math:`S` molecules and :math:`n` features.

.. math:
F_{logdet}\left(S\right) = \log{\det{\left(X[S]X[S]^T + I_{|S|} \right)}}

Higher values mean more diversity.

Parameters
----------
x : ndarray(S, n)
Expand All @@ -151,7 +148,7 @@ def logdet(x: np.ndarray) -> float:
Returns
-------
f_logdet: float
The volume of parallelotope spand by the matrix.
The volume of parallelotope spanned by the matrix.

Notes
-----
Expand All @@ -173,8 +170,10 @@ def shannon_entropy(x: np.ndarray) -> float:
.. math::
H(X) = \sum_{i=1}^{n}-P_i(X)\log{P_i(X)}

where X is the feature matrix, n is the number of features, and :math:`P_i(X)` is the
proportion of molecules that have feature :math:i in :math:X.
where :math:`X` is the feature matrix, :math:`n` is the number of features, and :math:`P_i(X)` is the
proportion of molecules that have feature :math:`i` in :math:`X`.

Higher values mean more diversity.

Parameters
----------
Expand All @@ -183,7 +182,7 @@ def shannon_entropy(x: np.ndarray) -> float:

Returns
-------
h_x: float
float :
The shannon entropy of the matrix.

Notes
Expand Down Expand Up @@ -218,23 +217,27 @@ def wdud(x: np.ndarray) -> float:
distribution of the values of the feature in :math:`x`, where :math:`y` is the ith feature. This
integral is calculated iteratively between :math:y_i and :math:y_{i+1}, using trapezoidal method.

Lower values mean more diversity.

Parameters
----------
x : ndarray(N, K)
Feature array of N molecules and K features.

Returns
-------
float:
float :
The mean of the WDUD of each feature over all molecules.

Notes
-----
Lower values of the WDUD mean more diversity because the features of the selected set are
more evenly distributed over the range of feature values.

Nakamura, T., Sakaue, S., Fujii, K., Harabuchi, Y., Maeda, S., and Iwata, S.. (2022)
Selecting molecules with diverse structures and properties by maximizing
submodular functions of descriptors learned with graph neural networks.
Scientific Reports 12.

"""
if x.ndim != 2:
raise ValueError(f"The number of dimensions {x.ndim} should be two.")
Expand Down Expand Up @@ -272,47 +275,52 @@ def wdud(x: np.ndarray) -> float:
return np.average(ans)


def hypersphere_overlap_of_subset(lib: np.ndarray, x: np.array) -> float:
def hypersphere_overlap_of_subset(x: np.ndarray, x_subset: np.array) -> float:
r"""Computes the overlap of subset with hyper-spheres around each point

The edge penalty is also included, which disregards areas
outside of the boundary of the full feature space/library.
This is calculated as:

.. math::
g(S) = \sum_{i < j}^k O(i, j) + \sum^k_m E(m),
g(S) = \sum_{i < j}^k O(i, j) + \sum^k_m E(m)

where :math:`i, j` is over the subset of molecules,
:math:`O(i, j)` is the approximate overlap between hyper-spheres,
:math:`O(i, j)` is the approximate overlap between hyperspheres,
:math:`k` is the number of features and :math:`E`
is the edge penalty of a molecule.

Lower values mean more diversity.

Parameters
----------
lib : ndarray
Feature matrix of all molecules.
x : ndarray
Feature matrix of all molecules.
x_subset : ndarray
Feature matrix of selected subset of molecules.

Returns
-------
g_s: float
The total diversity volume of the matrix.
float :
The approximate overlapping volume of hyperspheres
drawn around the selected points/molecules.

Notes
-----
The hypersphere overlap volume is calculated using an approximation formula from Agrafiotis (1997).

Agrafiotis, D. K.. (1997) Stochastic Algorithms for Maximizing Molecular Diversity.
Journal of Chemical Information and Computer Sciences 37, 841-851.
"""
d = len(x[0])
k = len(x[:, 0])
# Find the maximum and minimum over each feature across all molecules.
max_x = np.max(lib, axis=0)
min_x = np.min(lib, axis=0)
max_x = np.max(x, axis=0)
min_x = np.min(x, axis=0)
# Normalization of each feature to [0, 1]
if np.any(np.abs(max_x - min_x) < 1e-30):
raise ValueError(f"One of the features is redundant and causes normalization to fail.")
x_norm = (x - min_x) / (max_x - min_x)
x_norm = (x_subset - min_x) / (max_x - min_x)
# r_o = hypersphere radius
r_o = d * np.sqrt(1 / k)
if r_o > 0.5:
Expand Down Expand Up @@ -350,7 +358,7 @@ def hypersphere_overlap_of_subset(lib: np.ndarray, x: np.array) -> float:
return g_s


def gini_coefficient(a: np.ndarray):
def gini_coefficient(x: np.ndarray):
r"""
Gini coefficient of bit-wise fingerprints of a database of molecules.

Expand All @@ -363,33 +371,35 @@ def gini_coefficient(a: np.ndarray):
where :math:`y_i \in \{0, 1\}^N` is a vector of zero and ones of length the
number of molecules :math:`N` of the `i`th feature, and :math:`L` is the feature length.

Lower values mean more diversity.

Parameters
----------
a : ndarray(N, L)
x : ndarray(N, L)
Molecule features in L bits with N molecules.

Returns
-------
float :
Gini coefficient between zero and one, where closer to zero indicates more diversity.
Gini coefficient in the range [0,1].

References
----------
.. [1] Weidlich, Iwona E., and Igor V. Filippov. "Using the gini coefficient to measure the
chemical diversity of small‐molecule libraries." (2016): 2091-2097.
Weidlich, Iwona E., and Igor V. Filippov. "Using the gini coefficient to measure the
chemical diversity of small‐molecule libraries." (2016): 2091-2097.

"""
# Check that `a` is a bit-wise fingerprint.
if np.any(np.abs(np.sort(np.unique(a)) - np.array([0, 1])) > 1e-8):
raise ValueError("Attribute `a` should have binary values.")
if a.ndim != 2:
# Check that `x` is a bit-wise fingerprint.
if np.any(np.abs(np.sort(np.unique(x)) - np.array([0, 1])) > 1e-8):
raise ValueError("Attribute `x` should have binary values.")
if x.ndim != 2:
raise ValueError(
f"Attribute `a` should have dimension two rather than {a.ndim}."
f"Attribute `x` should have dimension two rather than {x.ndim}."
)

numb_features = a.shape[1]
numb_features = x.shape[1]
# Take the bit-count of each column/molecule.
bit_count = np.sum(a, axis=0)
bit_count = np.sum(x, axis=0)

# Sort the bit-count since Gini coefficients relies on cumulative distribution.
bit_count = np.sort(bit_count)
Expand Down
Loading