Add gomc parser (issue #77, PR #78)

* Add gomc parser * Use time as index name and column name to be able to use preprocessing module. * Add parsing test for GOMC * Add test for GOMC parser, TI, BAR, and MBAR estimator * Update the parsing documentation. Update the variable names in gomc parser * close #77 With contributions from @dotsdl (see PR #78 for commits)
alchemistry · Jul 30, 2019 · c90bb88 · c90bb88
1 parent 86d2d26
commit c90bb88
Show file tree

Hide file tree

Showing 9 changed files with 276 additions and 4 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1 +1,2 @@
 *.pyc
+.vscode
diff --git a/AUTHORS b/AUTHORS
@@ -33,3 +33,4 @@ Chronological list of authors
 2019
   - Victoria Lim (@vlim)
   - Hyungro Lee (@lee212)
+  - Mohammad S. Barhaghi (@msoroush)
diff --git a/CHANGES b/CHANGES
@@ -25,6 +25,7 @@ Enhancements
   - NAMD FEP parser (#7, #75)
   - BAR estimator (#40)
   - enhanced performance of Gromacs parsers with pandas.read_csv() (#81)
+  - GOMC TI and FEP parser (#77)
 
 Deprecations
 

diff --git a/docs/parsing.rst b/docs/parsing.rst
@@ -111,4 +111,5 @@ See the documentation for the package you are using for more details on parser u
     gmx
     amber
     namd
+    gomc
 
diff --git a/docs/parsing/alchemlyb.parsing.gomc.rst b/docs/parsing/alchemlyb.parsing.gomc.rst
@@ -0,0 +1,15 @@
+GOMC parsing
+==============
+.. automodule:: alchemlyb.parsing.gomc
+
+The parsers featured in this module are constructed to properly parse `GOMC <http://gomc.eng.wayne.edu/>`_ free energy output files, 
+containing the Hamiltonian derivatives (:math:`\frac{dH}{d\lambda}`) for TI-based estimators and Hamiltonian differences (:math:`\Delta H` 
+for all lambda states in the alchemical leg) for FEP-based estimators (BAR/MBAR).
+
+
+API Reference
+-------------
+This submodule includes these parsing functions:
+
+.. autofunction:: alchemlyb.parsing.gomc.extract_dHdl
+.. autofunction:: alchemlyb.parsing.gomc.extract_u_nk
diff --git a/src/alchemlyb/parsing/gomc.py b/src/alchemlyb/parsing/gomc.py
@@ -0,0 +1,203 @@
+"""Parsers for extracting alchemical data from `GOMC <http://gomc.eng.wayne.edu/>`_ output files.
+
+"""
+import pandas as pd
+
+from .util import anyopen
+
+
+# TODO: perhaps move constants elsewhere?
+# these are the units we need for dealing with GOMC, so not
+# a bad place for it, honestly
+# (kB in kJ/molK)
+k_b = 8.3144621E-3
+
+
+def extract_u_nk(filename, T):
+    """Return reduced potentials `u_nk` from a Hamiltonian differences dat file.
+
+    Parameters
+    ----------
+    filename : str
+        Path to free energy file to extract data from.
+    T : float
+        Temperature in Kelvin at which the simulation was sampled.
+
+    Returns
+    -------
+    u_nk : DataFrame
+        Potential energy for each alchemical state (k) for each frame (n).
+
+    """
+
+    dh_col_match = "dU/dL"
+    h_col_match = "DelE"
+    pv_col_match = 'PV'
+    u_col_match = ['Total_En']
+    beta = 1/(k_b * T)
+
+    state, lambdas, statevec = _extract_state(filename)
+
+    # extract a DataFrame from free energy file data
+    df = _extract_dataframe(filename)
+
+    times = df[df.columns[0]]
+
+    # want to grab only dH columns
+    DHcols = [col for col in df.columns if (h_col_match in col)]
+    dH = df[DHcols]
+
+    # GOMC also gives us pV directly; need this for reduced potential
+    pv_cols = [col for col in df.columns if (pv_col_match in col)]
+    pv = None
+    if pv_cols:
+        pv = df[pv_cols[0]]
+
+    # GOMC also gives us total energy U directly; need this for reduced potential
+    u_cols = [col for col in df.columns if any(single_u_col_match in col for single_u_col_match in u_col_match)]
+    u = None
+    if u_cols:
+        u = df[u_cols[0]]
+
+    u_k = dict()
+    cols = list()
+    for col in dH:
+        u_col = eval(col.split('->')[1][:-1])
+        # calculate reduced potential u_k = dH + pV + U
+        u_k[u_col] = beta * dH[col].values
+        if pv_cols:
+            u_k[u_col] += beta * pv.values
+        if u_cols:
+            u_k[u_col] += beta * u.values
+        cols.append(u_col)
+
+    u_k = pd.DataFrame(u_k, columns=cols,
+                       index=pd.Float64Index(times.values, name='time'))
+
+    # Need to modify the lambda name
+    cols = [l + "-lambda" for l in lambdas]
+    # create columns for each lambda, indicating state each row sampled from
+    for i, l in enumerate(cols):
+        u_k[l] = statevec[i]
+
+    # set up new multi-index
+    newind = ['time'] + cols
+    u_k = u_k.reset_index().set_index(newind)
+
+    u_k.name = 'u_nk'
+
+    return u_k
+
+
+def extract_dHdl(filename, T):
+    """Return gradients `dH/dl` from a Hamiltonian differences free energy file.
+
+    Parameters
+    ----------
+    filename : str
+        Path to free energy file to extract data from.
+    T : float
+        Temperature in Kelvin at which the simulation was sampled.
+
+    Returns
+    -------
+    dH/dl : Series
+        dH/dl as a function of step for this lambda window.
+
+    """
+    beta = 1/(k_b * T)
+
+    state, lambdas, statevec = _extract_state(filename)
+
+    # extract a DataFrame from free energy data
+    df = _extract_dataframe(filename)
+
+    times = df[df.columns[0]]
+
+    # want to grab only dH/dl columns
+    dHcols = []
+    for l in lambdas:
+        dHcols.extend([col for col in df.columns if (l in col)])
+
+    dHdl = df[dHcols]
+
+    # make dimensionless
+    dHdl *= beta
+
+    dHdl = pd.DataFrame(dHdl.values, columns=lambdas,
+                        index=pd.Float64Index(times.values, name='time'))
+
+    # Need to modify the lambda name
+    cols = [l + "-lambda" for l in lambdas]
+    # create columns for each lambda, indicating state each row sampled from
+    for i, l in enumerate(cols):
+        dHdl[l] = statevec[i]
+
+    # set up new multi-index
+    newind = ['time'] + cols
+    dHdl= dHdl.reset_index().set_index(newind)
+
+    dHdl.name='dH/dl'
+
+    return dHdl
+
+
+def _extract_state(filename):
+    """Extract information on state sampled, names of lambdas.
+
+    """
+    state = None
+    with anyopen(filename, 'r') as f:
+        for line in f:
+            if ('#' in line) and ('State' in line):
+                state = int(line.split('State')[1].split(':')[0])
+                # GOMC always print these two fields
+                lambdas = ['Coulomb', 'VDW']
+                statevec = eval(line.strip().split(' = ')[-1])
+                break
+
+    return state, lambdas, statevec
+
+
+def _extract_dataframe(filename):
+    """Extract a DataFrame from free energy data.
+
+    """
+    dh_col_match = "dU/dL"
+    h_col_match = "DelE"
+    pv_col_match = 'PV'
+    u_col_match = 'Total_En'
+
+    xaxis = "time"
+    with anyopen(filename, 'r') as f:
+        names = []
+        rows = []
+        for line in f:
+            line = line.strip()
+            if len(line) == 0:
+                # avoid parsing empty line
+                continue
+            elif line.startswith('#T'):
+                # this line has state information. No need to be parsed
+                continue
+            elif line.startswith("#Steps"):
+                # read the headers
+                elements = line.split()
+                for i, element in enumerate(elements):
+                    if element.startswith(u_col_match):
+                        names.append(element)
+                    elif element.startswith(dh_col_match):
+                        names.append(element)
+                    elif element.startswith(h_col_match):
+                        names.append(element)
+                    elif element.startswith(pv_col_match):
+                        names.append(element)
+            else:
+                # parse line as floats
+                row = map(float, line.split())
+                rows.append(row)
+
+    cols = [xaxis]
+    cols.extend(names)
+
+    return pd.DataFrame(rows, columns=cols)
diff --git a/src/alchemlyb/tests/parsing/test_gomc.py b/src/alchemlyb/tests/parsing/test_gomc.py
@@ -0,0 +1,32 @@
+"""GOMC parser tests.
+
+"""
+
+from alchemlyb.parsing.gomc import extract_dHdl, extract_u_nk
+from alchemtest.gomc import load_benzene
+
+
+def test_dHdl():
+    """Test that dHdl has the correct form when extracted from files.
+
+    """
+    dataset = load_benzene()
+
+    for filename in dataset['data']:
+        dHdl = extract_dHdl(filename, T=298)
+
+        assert dHdl.index.names == ['time', 'Coulomb-lambda', 'VDW-lambda']
+        assert dHdl.shape == (1000, 2)
+
+def test_u_nk():
+    """Test that u_nk has the correct form when extracted from files.
+
+    """
+    dataset = load_benzene()
+
+    for filename in dataset['data']:
+        u_nk = extract_u_nk(filename, T=298)
+
+        assert u_nk.index.names == ['time', 'Coulomb-lambda', 'VDW-lambda']
+        assert u_nk.shape == (1000, 23)
+
diff --git a/src/alchemlyb/tests/test_fep_estimators.py b/src/alchemlyb/tests/test_fep_estimators.py
@@ -6,10 +6,11 @@
 import numpy as np
 import pandas as pd
 
-from alchemlyb.parsing import gmx, amber, namd
+from alchemlyb.parsing import gmx, amber, namd, gomc
 from alchemlyb.estimators import MBAR, BAR
 import alchemtest.gmx
 import alchemtest.amber
+import alchemtest.gomc
 import alchemtest.namd
 
 def gmx_benzene_coul_u_nk():
@@ -83,6 +84,14 @@ def amber_bace_example_complex_vdw():
                       for filename in dataset['data']['complex']['vdw']])
     return u_nk
 
+def gomc_benzene_u_nk():
+    dataset = alchemtest.gomc.load_benzene()
+
+    u_nk = pd.concat([gomc.extract_u_nk(filename, T=298)
+                      for filename in dataset['data']])
+
+    return u_nk
+
 def namd_tyr2ala():
     dataset = alchemtest.namd.load_tyr2ala()
     u_nk1 = namd.extract_u_nk(dataset['data']['forward'][0], T=300)
@@ -126,6 +135,7 @@ class TestMBAR(FEPestimatorMixin):
                             (gmx_water_particle_with_potential_energy, -11.675, 0.083589),
                             (gmx_water_particle_without_energy, -11.654, 0.083415),
                             (amber_bace_example_complex_vdw, 2.40200, 0.0618453),
+                            (gomc_benzene_u_nk, -0.79994, 0.091579),
                     ])
     def X_delta_f(self, request):
         get_unk, E, dE = request.param
@@ -152,6 +162,7 @@ class TestBAR(FEPestimatorMixin):
                               (gmx_water_particle_without_energy, -11.660, 0.064914),
                               (amber_bace_example_complex_vdw, 2.37846, 0.050899),
                               (namd_tyr2ala, 11.0044, 0.10235),
+                              (gomc_benzene_u_nk, -0.87095, 0.071263),
                     ])
     def X_delta_f(self, request):
         get_unk, E, dE = request.param

diff --git a/src/alchemlyb/tests/test_ti_estimators.py b/src/alchemlyb/tests/test_ti_estimators.py
@@ -5,10 +5,11 @@
 
 import pandas as pd
 
-from alchemlyb.parsing import gmx, amber
+from alchemlyb.parsing import gmx, amber, gomc
 from alchemlyb.estimators import TI
 import alchemtest.gmx
 import alchemtest.amber
+import alchemtest.gomc
 
 
 def gmx_benzene_coul_dHdl():
@@ -91,9 +92,17 @@ def amber_simplesolvated_vdw_dHdl():
 
     return dHdl
 
+def gomc_benzene_dHdl():
+    dataset = alchemtest.gomc.load_benzene()
+
+    dHdl = pd.concat([gomc.extract_dHdl(filename, T=298)
+                      for filename in dataset['data']])
+
+    return dHdl
 
 
 class TIestimatorMixin:
+
     def test_get_delta_f(self, X_delta_f):
         dHdl, E, dE = X_delta_f
         est = self.cls().fit(dHdl)
@@ -127,5 +136,3 @@ class TestTI(TIestimatorMixin):
     def X_delta_f(self, request):
         get_dHdl, E, dE = request.param
         return get_dHdl(), E, dE
-
-