Skip to content

Commit

Permalink
Add gomc parser (issue #77, PR #78)
Browse files Browse the repository at this point in the history
* Add gomc parser
* Use time as index name and column name to be able to use preprocessing module.
* Add parsing test for GOMC
* Add test for GOMC parser, TI, BAR, and MBAR estimator
* Update the parsing documentation. Update the variable names in gomc parser
* close #77 

With contributions from @dotsdl (see PR #78 for commits)
  • Loading branch information
msoroush authored and orbeckst committed Jul 30, 2019
1 parent 86d2d26 commit c90bb88
Show file tree
Hide file tree
Showing 9 changed files with 276 additions and 4 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
*.pyc
.vscode
1 change: 1 addition & 0 deletions AUTHORS
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,4 @@ Chronological list of authors
2019
- Victoria Lim (@vlim)
- Hyungro Lee (@lee212)
- Mohammad S. Barhaghi (@msoroush)
1 change: 1 addition & 0 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ Enhancements
- NAMD FEP parser (#7, #75)
- BAR estimator (#40)
- enhanced performance of Gromacs parsers with pandas.read_csv() (#81)
- GOMC TI and FEP parser (#77)

Deprecations

Expand Down
1 change: 1 addition & 0 deletions docs/parsing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -111,4 +111,5 @@ See the documentation for the package you are using for more details on parser u
gmx
amber
namd
gomc

15 changes: 15 additions & 0 deletions docs/parsing/alchemlyb.parsing.gomc.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
GOMC parsing
==============
.. automodule:: alchemlyb.parsing.gomc

The parsers featured in this module are constructed to properly parse `GOMC <http://gomc.eng.wayne.edu/>`_ free energy output files,
containing the Hamiltonian derivatives (:math:`\frac{dH}{d\lambda}`) for TI-based estimators and Hamiltonian differences (:math:`\Delta H`
for all lambda states in the alchemical leg) for FEP-based estimators (BAR/MBAR).


API Reference
-------------
This submodule includes these parsing functions:

.. autofunction:: alchemlyb.parsing.gomc.extract_dHdl
.. autofunction:: alchemlyb.parsing.gomc.extract_u_nk
203 changes: 203 additions & 0 deletions src/alchemlyb/parsing/gomc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
"""Parsers for extracting alchemical data from `GOMC <http://gomc.eng.wayne.edu/>`_ output files.
"""
import pandas as pd

from .util import anyopen


# TODO: perhaps move constants elsewhere?
# these are the units we need for dealing with GOMC, so not
# a bad place for it, honestly
# (kB in kJ/molK)
k_b = 8.3144621E-3


def extract_u_nk(filename, T):
"""Return reduced potentials `u_nk` from a Hamiltonian differences dat file.
Parameters
----------
filename : str
Path to free energy file to extract data from.
T : float
Temperature in Kelvin at which the simulation was sampled.
Returns
-------
u_nk : DataFrame
Potential energy for each alchemical state (k) for each frame (n).
"""

dh_col_match = "dU/dL"
h_col_match = "DelE"
pv_col_match = 'PV'
u_col_match = ['Total_En']
beta = 1/(k_b * T)

state, lambdas, statevec = _extract_state(filename)

# extract a DataFrame from free energy file data
df = _extract_dataframe(filename)

times = df[df.columns[0]]

# want to grab only dH columns
DHcols = [col for col in df.columns if (h_col_match in col)]
dH = df[DHcols]

# GOMC also gives us pV directly; need this for reduced potential
pv_cols = [col for col in df.columns if (pv_col_match in col)]
pv = None
if pv_cols:
pv = df[pv_cols[0]]

# GOMC also gives us total energy U directly; need this for reduced potential
u_cols = [col for col in df.columns if any(single_u_col_match in col for single_u_col_match in u_col_match)]
u = None
if u_cols:
u = df[u_cols[0]]

u_k = dict()
cols = list()
for col in dH:
u_col = eval(col.split('->')[1][:-1])
# calculate reduced potential u_k = dH + pV + U
u_k[u_col] = beta * dH[col].values
if pv_cols:
u_k[u_col] += beta * pv.values
if u_cols:
u_k[u_col] += beta * u.values
cols.append(u_col)

u_k = pd.DataFrame(u_k, columns=cols,
index=pd.Float64Index(times.values, name='time'))

# Need to modify the lambda name
cols = [l + "-lambda" for l in lambdas]
# create columns for each lambda, indicating state each row sampled from
for i, l in enumerate(cols):
u_k[l] = statevec[i]

# set up new multi-index
newind = ['time'] + cols
u_k = u_k.reset_index().set_index(newind)

u_k.name = 'u_nk'

return u_k


def extract_dHdl(filename, T):
"""Return gradients `dH/dl` from a Hamiltonian differences free energy file.
Parameters
----------
filename : str
Path to free energy file to extract data from.
T : float
Temperature in Kelvin at which the simulation was sampled.
Returns
-------
dH/dl : Series
dH/dl as a function of step for this lambda window.
"""
beta = 1/(k_b * T)

state, lambdas, statevec = _extract_state(filename)

# extract a DataFrame from free energy data
df = _extract_dataframe(filename)

times = df[df.columns[0]]

# want to grab only dH/dl columns
dHcols = []
for l in lambdas:
dHcols.extend([col for col in df.columns if (l in col)])

dHdl = df[dHcols]

# make dimensionless
dHdl *= beta

dHdl = pd.DataFrame(dHdl.values, columns=lambdas,
index=pd.Float64Index(times.values, name='time'))

# Need to modify the lambda name
cols = [l + "-lambda" for l in lambdas]
# create columns for each lambda, indicating state each row sampled from
for i, l in enumerate(cols):
dHdl[l] = statevec[i]

# set up new multi-index
newind = ['time'] + cols
dHdl= dHdl.reset_index().set_index(newind)

dHdl.name='dH/dl'

return dHdl


def _extract_state(filename):
"""Extract information on state sampled, names of lambdas.
"""
state = None
with anyopen(filename, 'r') as f:
for line in f:
if ('#' in line) and ('State' in line):
state = int(line.split('State')[1].split(':')[0])
# GOMC always print these two fields
lambdas = ['Coulomb', 'VDW']
statevec = eval(line.strip().split(' = ')[-1])
break

return state, lambdas, statevec


def _extract_dataframe(filename):
"""Extract a DataFrame from free energy data.
"""
dh_col_match = "dU/dL"
h_col_match = "DelE"
pv_col_match = 'PV'
u_col_match = 'Total_En'

xaxis = "time"
with anyopen(filename, 'r') as f:
names = []
rows = []
for line in f:
line = line.strip()
if len(line) == 0:
# avoid parsing empty line
continue
elif line.startswith('#T'):
# this line has state information. No need to be parsed
continue
elif line.startswith("#Steps"):
# read the headers
elements = line.split()
for i, element in enumerate(elements):
if element.startswith(u_col_match):
names.append(element)
elif element.startswith(dh_col_match):
names.append(element)
elif element.startswith(h_col_match):
names.append(element)
elif element.startswith(pv_col_match):
names.append(element)
else:
# parse line as floats
row = map(float, line.split())
rows.append(row)

cols = [xaxis]
cols.extend(names)

return pd.DataFrame(rows, columns=cols)
32 changes: 32 additions & 0 deletions src/alchemlyb/tests/parsing/test_gomc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""GOMC parser tests.
"""

from alchemlyb.parsing.gomc import extract_dHdl, extract_u_nk
from alchemtest.gomc import load_benzene


def test_dHdl():
"""Test that dHdl has the correct form when extracted from files.
"""
dataset = load_benzene()

for filename in dataset['data']:
dHdl = extract_dHdl(filename, T=298)

assert dHdl.index.names == ['time', 'Coulomb-lambda', 'VDW-lambda']
assert dHdl.shape == (1000, 2)

def test_u_nk():
"""Test that u_nk has the correct form when extracted from files.
"""
dataset = load_benzene()

for filename in dataset['data']:
u_nk = extract_u_nk(filename, T=298)

assert u_nk.index.names == ['time', 'Coulomb-lambda', 'VDW-lambda']
assert u_nk.shape == (1000, 23)

13 changes: 12 additions & 1 deletion src/alchemlyb/tests/test_fep_estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@
import numpy as np
import pandas as pd

from alchemlyb.parsing import gmx, amber, namd
from alchemlyb.parsing import gmx, amber, namd, gomc
from alchemlyb.estimators import MBAR, BAR
import alchemtest.gmx
import alchemtest.amber
import alchemtest.gomc
import alchemtest.namd

def gmx_benzene_coul_u_nk():
Expand Down Expand Up @@ -83,6 +84,14 @@ def amber_bace_example_complex_vdw():
for filename in dataset['data']['complex']['vdw']])
return u_nk

def gomc_benzene_u_nk():
dataset = alchemtest.gomc.load_benzene()

u_nk = pd.concat([gomc.extract_u_nk(filename, T=298)
for filename in dataset['data']])

return u_nk

def namd_tyr2ala():
dataset = alchemtest.namd.load_tyr2ala()
u_nk1 = namd.extract_u_nk(dataset['data']['forward'][0], T=300)
Expand Down Expand Up @@ -126,6 +135,7 @@ class TestMBAR(FEPestimatorMixin):
(gmx_water_particle_with_potential_energy, -11.675, 0.083589),
(gmx_water_particle_without_energy, -11.654, 0.083415),
(amber_bace_example_complex_vdw, 2.40200, 0.0618453),
(gomc_benzene_u_nk, -0.79994, 0.091579),
])
def X_delta_f(self, request):
get_unk, E, dE = request.param
Expand All @@ -152,6 +162,7 @@ class TestBAR(FEPestimatorMixin):
(gmx_water_particle_without_energy, -11.660, 0.064914),
(amber_bace_example_complex_vdw, 2.37846, 0.050899),
(namd_tyr2ala, 11.0044, 0.10235),
(gomc_benzene_u_nk, -0.87095, 0.071263),
])
def X_delta_f(self, request):
get_unk, E, dE = request.param
Expand Down
13 changes: 10 additions & 3 deletions src/alchemlyb/tests/test_ti_estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@

import pandas as pd

from alchemlyb.parsing import gmx, amber
from alchemlyb.parsing import gmx, amber, gomc
from alchemlyb.estimators import TI
import alchemtest.gmx
import alchemtest.amber
import alchemtest.gomc


def gmx_benzene_coul_dHdl():
Expand Down Expand Up @@ -91,9 +92,17 @@ def amber_simplesolvated_vdw_dHdl():

return dHdl

def gomc_benzene_dHdl():
dataset = alchemtest.gomc.load_benzene()

dHdl = pd.concat([gomc.extract_dHdl(filename, T=298)
for filename in dataset['data']])

return dHdl


class TIestimatorMixin:

def test_get_delta_f(self, X_delta_f):
dHdl, E, dE = X_delta_f
est = self.cls().fit(dHdl)
Expand Down Expand Up @@ -127,5 +136,3 @@ class TestTI(TIestimatorMixin):
def X_delta_f(self, request):
get_dHdl, E, dE = request.param
return get_dHdl(), E, dE


0 comments on commit c90bb88

Please sign in to comment.