alchemistry · orbeckst · Jul 30, 2019 · Apr 27, 2019 · May 3, 2019 · May 6, 2019
diff --git a/.gitignore b/.gitignore
@@ -1 +1,2 @@
 *.pyc
+.vscode
diff --git a/AUTHORS b/AUTHORS
@@ -0,0 +1,36 @@
+.. -*- coding: utf-8 -*-
+
+======================
+ Authors of alchemlyb
+======================
+
+The alchemlyb project is lead by Oliver Beckstein (@orbeckst), David
+Mobley (@davidlmobley), and Michal Shirts (@mrshirts). alchemlyb was
+originally created by David Dotson (@dotsdl).
+
+All contributing authors are listed in this file below. Full name and
+GitHub handle are preferred. The repository history at
+https://github.com/alchemistry/alchemlyb and the CHANGES file show
+individual code contributions.
+
+
+Chronological list of authors
+-----------------------------
+
+2016
+  - David Dotson (@dotsdl)
+  - Ian Kenney (@ianmkenney)
+
+2017
+  - Oliver Beckstein (@orbeckst)
+  - Shuai Liu (@shuail)
+  - Travis Jensen (@trje3733)
+
+2018
+  - Bryce Allen (@brycestx)
+  - Dominik Wille (@harlor)
+
+2019
+  - Victoria Lim (@vlim)
+  - Hyungro Lee (@lee212)
+  - Mohammad S. Barhaghi (@msoroush)
diff --git a/CHANGES b/CHANGES
@@ -0,0 +1,52 @@
+# -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8 -*-
+====================
+ alchemlyb CHANGELOG
+=====================
+
+The rules for this file:
+  * entries are sorted newest-first.
+  * summarize sets of changes - don't reproduce every git log comment here.
+  * don't ever delete anything.
+  * keep the format consistent (79 char width, M/D/Y date format) and do not
+    use tabs but use spaces for formatting
+  * accompany each entry with github issue/PR number (Issue #xyz)
+  * release numbers follow "Semantic Versioning" http://semver.org
+
+------------------------------------------------------------------------------
+
+*/*/2019 dotsdl, orbeckst, shuail, trje3733, brycestx, harlor, vtlim, lee212
+
+  * 0.2.0
+
+Enhancements
+  - Amber TI parser (#10)
+  - Amber FEP (MBAR) parser (#42)
+  - Gromacs extended ensemble parser (#14)
+  - NAMD FEP parser (#7, #75)
+  - BAR estimator (#40)
+  - enhanced performance of Gromacs parsers with pandas.read_csv() (#81)
+  - GOMC TI and FEP parser (#77)
+
+Deprecations
+
+Fixes
+  - fixed TI estimator (PR #61)
+  - correctly use pV and U in the Gromacs parser (#59)
+
+Changes
+  - defaults for statistical_inefficiency() are more conservative (#39)
+
+
+05/27/2017 dotsdl, ianmkenney, orbeckst
+
+  * 0.1.0
+
+First release
+
+Features:
+
+    - Parsers for GROMACS, including reduced potentials and gradients.
+    - Subsampler functions for slicing, statitistical inefficiency, equilibration detection.
+    - Minimally functional estimators for MBAR, TI.
+    - high test coverage (works with data in alchemistry/alchemtests)
+
diff --git a/docs/parsing.rst b/docs/parsing.rst
@@ -111,4 +111,5 @@ See the documentation for the package you are using for more details on parser u
     gmx
     amber
     namd
+    gomc
 
diff --git a/docs/parsing/alchemlyb.parsing.gomc.rst b/docs/parsing/alchemlyb.parsing.gomc.rst
@@ -0,0 +1,15 @@
+GOMC parsing
+==============
+.. automodule:: alchemlyb.parsing.gomc
+
+The parsers featured in this module are constructed to properly parse `GOMC <http://gomc.eng.wayne.edu/>`_ free energy output files, 
+containing the Hamiltonian derivatives (:math:`\frac{dH}{d\lambda}`) for TI-based estimators and Hamiltonian differences (:math:`\Delta H` 
+for all lambda states in the alchemical leg) for FEP-based estimators (BAR/MBAR).
+
+
+API Reference
+-------------
+This submodule includes these parsing functions:
+
+.. autofunction:: alchemlyb.parsing.gomc.extract_dHdl
+.. autofunction:: alchemlyb.parsing.gomc.extract_u_nk
diff --git a/src/alchemlyb/parsing/gomc.py b/src/alchemlyb/parsing/gomc.py
@@ -0,0 +1,203 @@
+"""Parsers for extracting alchemical data from `GOMC <http://gomc.eng.wayne.edu/>`_ output files.
+
+"""
+import pandas as pd
+
+from .util import anyopen
+
+
+# TODO: perhaps move constants elsewhere?
+# these are the units we need for dealing with GOMC, so not
+# a bad place for it, honestly
+# (kB in kJ/molK)
+k_b = 8.3144621E-3
+
+
+def extract_u_nk(filename, T):
+    """Return reduced potentials `u_nk` from a Hamiltonian differences dat file.
+
+    Parameters
+    ----------
+    filename : str
+        Path to free energy file to extract data from.
+    T : float
+        Temperature in Kelvin at which the simulation was sampled.
+
+    Returns
+    -------
+    u_nk : DataFrame
+        Potential energy for each alchemical state (k) for each frame (n).
+
+    """
+
+    dh_col_match = "dU/dL"
+    h_col_match = "DelE"
+    pv_col_match = 'PV'
+    u_col_match = ['Total_En']
+    beta = 1/(k_b * T)
+
+    state, lambdas, statevec = _extract_state(filename)
+
+    # extract a DataFrame from free energy file data
+    df = _extract_dataframe(filename)
+
+    times = df[df.columns[0]]
+
+    # want to grab only dH columns
+    DHcols = [col for col in df.columns if (h_col_match in col)]
+    dH = df[DHcols]
+
+    # GOMC also gives us pV directly; need this for reduced potential
+    pv_cols = [col for col in df.columns if (pv_col_match in col)]
+    pv = None
+    if pv_cols:
+        pv = df[pv_cols[0]]
+
+    # GOMC also gives us total energy U directly; need this for reduced potential
+    u_cols = [col for col in df.columns if any(single_u_col_match in col for single_u_col_match in u_col_match)]
+    u = None
+    if u_cols:
+        u = df[u_cols[0]]
+
+    u_k = dict()
+    cols = list()
+    for col in dH:
+        u_col = eval(col.split('->')[1][:-1])
+        # calculate reduced potential u_k = dH + pV + U
+        u_k[u_col] = beta * dH[col].values
+        if pv_cols:
+            u_k[u_col] += beta * pv.values
+        if u_cols:
+            u_k[u_col] += beta * u.values
+        cols.append(u_col)
+
+    u_k = pd.DataFrame(u_k, columns=cols,
+                       index=pd.Float64Index(times.values, name='time'))
+
+    # Need to modify the lambda name
+    cols = [l + "-lambda" for l in lambdas]
+    # create columns for each lambda, indicating state each row sampled from
+    for i, l in enumerate(cols):
+        u_k[l] = statevec[i]
+
+    # set up new multi-index
+    newind = ['time'] + cols
+    u_k = u_k.reset_index().set_index(newind)
+
+    u_k.name = 'u_nk'
+
+    return u_k
+
+
+def extract_dHdl(filename, T):
+    """Return gradients `dH/dl` from a Hamiltonian differences free energy file.
+
+    Parameters
+    ----------
+    filename : str
+        Path to free energy file to extract data from.
+    T : float
+        Temperature in Kelvin at which the simulation was sampled.
+
+    Returns
+    -------
+    dH/dl : Series
+        dH/dl as a function of step for this lambda window.
+
+    """
+    beta = 1/(k_b * T)
+
+    state, lambdas, statevec = _extract_state(filename)
+
+    # extract a DataFrame from free energy data
+    df = _extract_dataframe(filename)
+
+    times = df[df.columns[0]]
+
+    # want to grab only dH/dl columns
+    dHcols = []
+    for l in lambdas:
+        dHcols.extend([col for col in df.columns if (l in col)])
+
+    dHdl = df[dHcols]
+
+    # make dimensionless
+    dHdl *= beta
+
+    dHdl = pd.DataFrame(dHdl.values, columns=lambdas,
+                        index=pd.Float64Index(times.values, name='time'))
+
+    # Need to modify the lambda name
+    cols = [l + "-lambda" for l in lambdas]
+    # create columns for each lambda, indicating state each row sampled from
+    for i, l in enumerate(cols):
+        dHdl[l] = statevec[i]
+
+    # set up new multi-index
+    newind = ['time'] + cols
+    dHdl= dHdl.reset_index().set_index(newind)
+
+    dHdl.name='dH/dl'
+
+    return dHdl
+
+
+def _extract_state(filename):
+    """Extract information on state sampled, names of lambdas.
+
+    """
+    state = None
+    with anyopen(filename, 'r') as f:
+        for line in f:
+            if ('#' in line) and ('State' in line):
+                state = int(line.split('State')[1].split(':')[0])
+                # GOMC always print these two fields
+                lambdas = ['Coulomb', 'VDW']
+                statevec = eval(line.strip().split(' = ')[-1])
+                break
+
+    return state, lambdas, statevec
+
+
+def _extract_dataframe(filename):
+    """Extract a DataFrame from free energy data.
+
+    """
+    dh_col_match = "dU/dL"
+    h_col_match = "DelE"
+    pv_col_match = 'PV'
+    u_col_match = 'Total_En'
+
+    xaxis = "time"
+    with anyopen(filename, 'r') as f:
+        names = []
+        rows = []
+        for line in f:
+            line = line.strip()
+            if len(line) == 0:
+                # avoid parsing empty line
+                continue
+            elif line.startswith('#T'):
+                # this line has state information. No need to be parsed
+                continue
+            elif line.startswith("#Steps"):
+                # read the headers
+                elements = line.split()
+                for i, element in enumerate(elements):
+                    if element.startswith(u_col_match):
+                        names.append(element)
+                    elif element.startswith(dh_col_match):
+                        names.append(element)
+                    elif element.startswith(h_col_match):
+                        names.append(element)
+                    elif element.startswith(pv_col_match):
+                        names.append(element)
+            else:
+                # parse line as floats
+                row = map(float, line.split())
+                rows.append(row)
+
+    cols = [xaxis]
+    cols.extend(names)
+
+    return pd.DataFrame(rows, columns=cols)
diff --git a/src/alchemlyb/tests/parsing/test_gomc.py b/src/alchemlyb/tests/parsing/test_gomc.py
@@ -0,0 +1,32 @@
+"""GOMC parser tests.
+
+"""
+
+from alchemlyb.parsing.gomc import extract_dHdl, extract_u_nk
+from alchemtest.gomc import load_benzene
+
+
+def test_dHdl():
+    """Test that dHdl has the correct form when extracted from files.
+
+    """
+    dataset = load_benzene()
+
+    for filename in dataset['data']:
+        dHdl = extract_dHdl(filename, T=298)
+
+        assert dHdl.index.names == ['time', 'Coulomb-lambda', 'VDW-lambda']
+        assert dHdl.shape == (1000, 2)
+
+def test_u_nk():
+    """Test that u_nk has the correct form when extracted from files.
+
+    """
+    dataset = load_benzene()
+
+    for filename in dataset['data']:
+        u_nk = extract_u_nk(filename, T=298)
+
+        assert u_nk.index.names == ['time', 'Coulomb-lambda', 'VDW-lambda']
+        assert u_nk.shape == (1000, 23)
+
diff --git a/src/alchemlyb/tests/test_fep_estimators.py b/src/alchemlyb/tests/test_fep_estimators.py
@@ -6,10 +6,11 @@
 import numpy as np
 import pandas as pd
 
-from alchemlyb.parsing import gmx, amber, namd
+from alchemlyb.parsing import gmx, amber, namd, gomc
 from alchemlyb.estimators import MBAR, BAR
 import alchemtest.gmx
 import alchemtest.amber
+import alchemtest.gomc
 import alchemtest.namd
 
 def gmx_benzene_coul_u_nk():
@@ -83,6 +84,14 @@ def amber_bace_example_complex_vdw():
                       for filename in dataset['data']['complex']['vdw']])
     return u_nk
 
+def gomc_benzene_u_nk():
+    dataset = alchemtest.gomc.load_benzene()
+
+    u_nk = pd.concat([gomc.extract_u_nk(filename, T=298)
+                      for filename in dataset['data']])
+
+    return u_nk
+
 def namd_tyr2ala():
     dataset = alchemtest.namd.load_tyr2ala()
     u_nk1 = namd.extract_u_nk(dataset['data']['forward'][0], T=300)
@@ -126,6 +135,7 @@ class TestMBAR(FEPestimatorMixin):
                             (gmx_water_particle_with_potential_energy, -11.675, 0.083589),
                             (gmx_water_particle_without_energy, -11.654, 0.083415),
                             (amber_bace_example_complex_vdw, 2.40200, 0.0618453),
+                            (gomc_benzene_u_nk, -0.79994, 0.091579),
                     ])
     def X_delta_f(self, request):
         get_unk, E, dE = request.param
@@ -152,6 +162,7 @@ class TestBAR(FEPestimatorMixin):
                               (gmx_water_particle_without_energy, -11.660, 0.064914),
                               (amber_bace_example_complex_vdw, 2.37846, 0.050899),
                               (namd_tyr2ala, 11.0044, 0.10235),
+                              (gomc_benzene_u_nk, -0.87095, 0.071263),
                     ])
     def X_delta_f(self, request):
         get_unk, E, dE = request.param