theochem · FanwangM · Sep 16, 2023 · Jun 20, 2023 · Jun 20, 2023 · Jun 20, 2023
diff --git a/DiverseSelector/diversity.py b/DiverseSelector/diversity.py
@@ -42,7 +42,8 @@
 
 def compute_diversity(
     features: np.array,
-    div_type: str = "total_diversity_volume",
+    div_type: str = "entropy",
+    library: np.array = None,
 ) -> float:
     """Compute diversity metrics.
 
@@ -53,11 +54,11 @@ def compute_diversity(
     div_type : str, optional
         Method of calculation diversity for a given molecule set, which
         includes "entropy", "logdet", "shannon_entropy", "wdud",
-        gini_coefficient" and "total_diversity_volume". Default is "total_diversity_volume".
-    mols : List[rdkit.Chem.rdchem.Mol], optional
-        List of RDKit molecule objects. This is only needed when using the
-        "explicit_diversity_index" method. Default=None.
-
+        gini_coefficient" and "hypersphere_overlap_of_subset".
+        Default is "entropy".
+    library : np.ndarray, optional
+        Feature matrix of entire molecule library, used only if
+        calculating hypersphere_overlap_of_subset. Default is "None".
     Returns
     -------
     float, computed diversity.
@@ -68,12 +69,13 @@ def compute_diversity(
         "logdet": logdet,
         "shannon_entropy": shannon_entropy,
         "wdud": wdud,
-        "total_diversity_volume": total_diversity_volume,
         "gini_coefficient": gini_coefficient,
     }
 
     if div_type in func_dict:
         return func_dict[div_type](features)
+    elif div_type == "hypersphere_overlap_of_subset":
+        return hypersphere_overlap_of_subset(library, features)
     else:
         raise ValueError(f"Diversity type {div_type} not supported.")
 
@@ -83,11 +85,13 @@ def entropy(x: np.ndarray) -> float:
 
     The equation for entropy is
     .. math::
-        E = $-\frac{\sum{\frac{y_i}{N}\ln{\frac{y_i}{N}}}}{L\frac{\ln{2}}{2}}$
+        E = -\frac{\sum{\frac{y_i}{N}\ln{\frac{y_i}{N}}}}{L\frac{\ln{2}}{2}}
 
     where N is the number of molecules in the set, L is the length of the fingerprint,
     and :math:y_i is a vector of the bitcounts of each feature in the fingerprints.
 
+    Higher values mean more diversity.
+
     Parameters
     ----------
     x : ndarray
@@ -96,53 +100,46 @@ def entropy(x: np.ndarray) -> float:
     Returns
     -------
     e : float
-        Entropy of matrix.
+        Entropy of matrix in the range [0,1].
 
     Notes
     -----
-    Feature matrices are converted to bits,
-    so we lose any information associated with num in matrix.
+    The feature matrix does not need to be in bitstring form to use this function.
+    However, the features should be encoded in a binary way, such that 0 means
+    absence of the feature, and nonzero means presence of the feature.
+
     Weidlich, I. E., and Filippov, I. V. (2016)
     Using the Gini coefficient to measure the chemical diversity of small-molecule libraries.
     Journal of Computational Chemistry 37, 2091-2097.
     """
 
-    # convert input matrix to bit matrix
-    y = np.empty(x.shape)
-    for i in range(0, len(x)):
-        for j in range(0, len(x[0])):
-            if x[i, j] != 0:
-                y[i, j] = 1
-            else:
-                y[i, j] = 0
     # initialize variables
     length = len(x[0])
     n = len(x)
     top = 0
-    val = []
-    # count bits in fingerprint
-    for i in range(0, length):
-        val.append(sum(y[:, i]))
-    ans = np.sort(val)
+    # count bits in fingerprint and order them
+    counts = np.count_nonzero(x, axis=0)
+    counts = np.sort(counts)
     # sum entropy calculation for each feature
+    if 0 in counts:
+        raise ValueError
     for i in range(0, length):
-        if ans[i] == 0:
-            raise ValueError
-        if ans[i] != 0:
-            top += ((ans[i]) / n) * (np.log(ans[i] / n))
+        top += ((counts[i]) / n) * (np.log(counts[i] / n))
     e = -1 * (top / (length * 0.34657359027997264))
     return e
 
 
 def logdet(x: np.ndarray) -> float:
     r"""Computes the log determinant function.
 
-    Input is an :math:S\times :math:n feature matrix with
-    :math:S molecules and :math:n features.
+    Input is an :math:`S\times n` feature matrix with
+    :math:`S` molecules and :math:`n` features.
 
     .. math:
         F_{logdet}\left(S\right) = \log{\det{\left(X[S]X[S]^T + I_{|S|} \right)}}
 
+    Higher values mean more diversity.
+
     Parameters
     ----------
     x : ndarray(S, n)
@@ -151,7 +148,7 @@ def logdet(x: np.ndarray) -> float:
     Returns
     -------
     f_logdet: float
-        The volume of parallelotope spand by the matrix.
+        The volume of parallelotope spanned by the matrix.
 
     Notes
     -----
@@ -173,8 +170,10 @@ def shannon_entropy(x: np.ndarray) -> float:
     .. math::
         H(X) = \sum_{i=1}^{n}-P_i(X)\log{P_i(X)}
 
-    where X is the feature matrix, n is the number of features, and :math:`P_i(X)` is the
-    proportion of molecules that have feature :math:i in :math:X.
+    where :math:`X` is the feature matrix, :math:`n` is the number of features, and :math:`P_i(X)` is the
+    proportion of molecules that have feature :math:`i` in :math:`X`.
+
+    Higher values mean more diversity.
 
     Parameters
     ----------
@@ -183,7 +182,7 @@ def shannon_entropy(x: np.ndarray) -> float:
 
     Returns
     -------
-    h_x: float
+    float :
         The shannon entropy of the matrix.
 
     Notes
@@ -218,23 +217,27 @@ def wdud(x: np.ndarray) -> float:
     distribution of the values of the feature in :math:`x`, where :math:`y` is the ith feature. This
     integral is calculated iteratively between :math:y_i and :math:y_{i+1}, using trapezoidal method.
 
+    Lower values mean more diversity.
+
     Parameters
     ----------
     x : ndarray(N, K)
         Feature array of N molecules and K features.
 
     Returns
     -------
-    float:
+    float :
         The mean of the WDUD of each feature over all molecules.
 
     Notes
     -----
+    Lower values of the WDUD mean more diversity because the features of the selected set are
+     more evenly distributed over the range of feature values.
+
     Nakamura, T., Sakaue, S., Fujii, K., Harabuchi, Y., Maeda, S., and Iwata, S.. (2022)
     Selecting molecules with diverse structures and properties by maximizing
     submodular functions of descriptors learned with graph neural networks.
     Scientific Reports 12.
-
     """
     if x.ndim != 2:
         raise ValueError(f"The number of dimensions {x.ndim} should be two.")
@@ -272,47 +275,52 @@ def wdud(x: np.ndarray) -> float:
     return np.average(ans)
 
 
-def hypersphere_overlap_of_subset(lib: np.ndarray, x: np.array) -> float:
+def hypersphere_overlap_of_subset(x: np.ndarray, x_subset: np.array) -> float:
     r"""Computes the overlap of subset with hyper-spheres around each point
 
     The edge penalty is also included, which disregards areas
     outside of the boundary of the full feature space/library.
     This is calculated as:
 
     .. math::
-        g(S) = \sum_{i < j}^k O(i, j) + \sum^k_m E(m),
+        g(S) = \sum_{i < j}^k O(i, j) + \sum^k_m E(m)
 
     where :math:`i, j` is over the subset of molecules,
-    :math:`O(i, j)` is the approximate overlap between hyper-spheres,
+    :math:`O(i, j)` is the approximate overlap between hyperspheres,
     :math:`k` is the number of features and :math:`E`
     is the edge penalty of a molecule.
 
+    Lower values mean more diversity.
+
     Parameters
     ----------
-    lib : ndarray
-        Feature matrix of all molecules.
     x : ndarray
+        Feature matrix of all molecules.
+    x_subset : ndarray
         Feature matrix of selected subset of molecules.
 
     Returns
     -------
-    g_s: float
-        The total diversity volume of the matrix.
+    float :
+        The approximate overlapping volume of hyperspheres
+        drawn around the selected points/molecules.
 
     Notes
     -----
+    The hypersphere overlap volume is calculated using an approximation formula from Agrafiotis (1997).
+
     Agrafiotis, D. K.. (1997) Stochastic Algorithms for Maximizing Molecular Diversity.
     Journal of Chemical Information and Computer Sciences 37, 841-851.
     """
     d = len(x[0])
     k = len(x[:, 0])
     # Find the maximum and minimum over each feature across all molecules.
-    max_x = np.max(lib, axis=0)
-    min_x = np.min(lib, axis=0)
+    max_x = np.max(x, axis=0)
+    min_x = np.min(x, axis=0)
     # Normalization of each feature to [0, 1]
     if np.any(np.abs(max_x - min_x) < 1e-30):
         raise ValueError(f"One of the features is redundant and causes normalization to fail.")
-    x_norm = (x - min_x) / (max_x - min_x)
+    x_norm = (x_subset - min_x) / (max_x - min_x)
     # r_o = hypersphere radius
     r_o = d * np.sqrt(1 / k)
     if r_o > 0.5:
@@ -350,7 +358,7 @@ def hypersphere_overlap_of_subset(lib: np.ndarray, x: np.array) -> float:
     return g_s
 
 
-def gini_coefficient(a: np.ndarray):
+def gini_coefficient(x: np.ndarray):
     r"""
     Gini coefficient of bit-wise fingerprints of a database of molecules.
 
@@ -363,33 +371,35 @@ def gini_coefficient(a: np.ndarray):
     where :math:`y_i \in \{0, 1\}^N` is a vector of zero and ones of length the
     number of molecules :math:`N` of the `i`th feature, and :math:`L` is the feature length.
 
+    Lower values mean more diversity.
+
     Parameters
     ----------
-    a : ndarray(N, L)
+    x : ndarray(N, L)
         Molecule features in L bits with N molecules.
 
     Returns
     -------
     float :
-        Gini coefficient between zero and one, where closer to zero indicates more diversity.
+        Gini coefficient in the range [0,1].
 
     References
     ----------
-    .. [1] Weidlich, Iwona E., and Igor V. Filippov. "Using the gini coefficient to measure the
-           chemical diversity of small‐molecule libraries." (2016): 2091-2097.
+    Weidlich, Iwona E., and Igor V. Filippov. "Using the gini coefficient to measure the
+    chemical diversity of small‐molecule libraries." (2016): 2091-2097.
 
     """
-    # Check that `a` is a bit-wise fingerprint.
-    if np.any(np.abs(np.sort(np.unique(a)) - np.array([0, 1])) > 1e-8):
-        raise ValueError("Attribute `a` should have binary values.")
-    if a.ndim != 2:
+    # Check that `x` is a bit-wise fingerprint.
+    if np.any(np.abs(np.sort(np.unique(x)) - np.array([0, 1])) > 1e-8):
+        raise ValueError("Attribute `x` should have binary values.")
+    if x.ndim != 2:
         raise ValueError(
-            f"Attribute `a` should have dimension two rather than {a.ndim}."
+            f"Attribute `x` should have dimension two rather than {x.ndim}."
         )
 
-    numb_features = a.shape[1]
+    numb_features = x.shape[1]
     # Take the bit-count of each column/molecule.
-    bit_count = np.sum(a, axis=0)
+    bit_count = np.sum(x, axis=0)
 
     # Sort the bit-count since Gini coefficients relies on cumulative distribution.
     bit_count = np.sort(bit_count)