diff --git a/.gitignore b/.gitignore index 0d20b648..2a45ed72 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ *.pyc +.vscode diff --git a/AUTHORS b/AUTHORS index a0091c9e..672262b7 100644 --- a/AUTHORS +++ b/AUTHORS @@ -33,3 +33,4 @@ Chronological list of authors 2019 - Victoria Lim (@vlim) - Hyungro Lee (@lee212) + - Mohammad S. Barhaghi (@msoroush) \ No newline at end of file diff --git a/CHANGES b/CHANGES index bab67b0c..7a4df338 100644 --- a/CHANGES +++ b/CHANGES @@ -25,6 +25,7 @@ Enhancements - NAMD FEP parser (#7, #75) - BAR estimator (#40) - enhanced performance of Gromacs parsers with pandas.read_csv() (#81) + - GOMC TI and FEP parser (#77) Deprecations diff --git a/docs/parsing.rst b/docs/parsing.rst index 0fcf060c..c4cc96f4 100644 --- a/docs/parsing.rst +++ b/docs/parsing.rst @@ -111,4 +111,5 @@ See the documentation for the package you are using for more details on parser u gmx amber namd + gomc diff --git a/docs/parsing/alchemlyb.parsing.gomc.rst b/docs/parsing/alchemlyb.parsing.gomc.rst new file mode 100644 index 00000000..200d4a29 --- /dev/null +++ b/docs/parsing/alchemlyb.parsing.gomc.rst @@ -0,0 +1,15 @@ +GOMC parsing +============== +.. automodule:: alchemlyb.parsing.gomc + +The parsers featured in this module are constructed to properly parse `GOMC `_ free energy output files, +containing the Hamiltonian derivatives (:math:`\frac{dH}{d\lambda}`) for TI-based estimators and Hamiltonian differences (:math:`\Delta H` +for all lambda states in the alchemical leg) for FEP-based estimators (BAR/MBAR). + + +API Reference +------------- +This submodule includes these parsing functions: + +.. autofunction:: alchemlyb.parsing.gomc.extract_dHdl +.. autofunction:: alchemlyb.parsing.gomc.extract_u_nk diff --git a/src/alchemlyb/parsing/gomc.py b/src/alchemlyb/parsing/gomc.py new file mode 100644 index 00000000..dc3433b3 --- /dev/null +++ b/src/alchemlyb/parsing/gomc.py @@ -0,0 +1,203 @@ +"""Parsers for extracting alchemical data from `GOMC `_ output files. + +""" +import pandas as pd + +from .util import anyopen + + +# TODO: perhaps move constants elsewhere? +# these are the units we need for dealing with GOMC, so not +# a bad place for it, honestly +# (kB in kJ/molK) +k_b = 8.3144621E-3 + + +def extract_u_nk(filename, T): + """Return reduced potentials `u_nk` from a Hamiltonian differences dat file. + + Parameters + ---------- + filename : str + Path to free energy file to extract data from. + T : float + Temperature in Kelvin at which the simulation was sampled. + + Returns + ------- + u_nk : DataFrame + Potential energy for each alchemical state (k) for each frame (n). + + """ + + dh_col_match = "dU/dL" + h_col_match = "DelE" + pv_col_match = 'PV' + u_col_match = ['Total_En'] + beta = 1/(k_b * T) + + state, lambdas, statevec = _extract_state(filename) + + # extract a DataFrame from free energy file data + df = _extract_dataframe(filename) + + times = df[df.columns[0]] + + # want to grab only dH columns + DHcols = [col for col in df.columns if (h_col_match in col)] + dH = df[DHcols] + + # GOMC also gives us pV directly; need this for reduced potential + pv_cols = [col for col in df.columns if (pv_col_match in col)] + pv = None + if pv_cols: + pv = df[pv_cols[0]] + + # GOMC also gives us total energy U directly; need this for reduced potential + u_cols = [col for col in df.columns if any(single_u_col_match in col for single_u_col_match in u_col_match)] + u = None + if u_cols: + u = df[u_cols[0]] + + u_k = dict() + cols = list() + for col in dH: + u_col = eval(col.split('->')[1][:-1]) + # calculate reduced potential u_k = dH + pV + U + u_k[u_col] = beta * dH[col].values + if pv_cols: + u_k[u_col] += beta * pv.values + if u_cols: + u_k[u_col] += beta * u.values + cols.append(u_col) + + u_k = pd.DataFrame(u_k, columns=cols, + index=pd.Float64Index(times.values, name='time')) + + # Need to modify the lambda name + cols = [l + "-lambda" for l in lambdas] + # create columns for each lambda, indicating state each row sampled from + for i, l in enumerate(cols): + u_k[l] = statevec[i] + + # set up new multi-index + newind = ['time'] + cols + u_k = u_k.reset_index().set_index(newind) + + u_k.name = 'u_nk' + + return u_k + + +def extract_dHdl(filename, T): + """Return gradients `dH/dl` from a Hamiltonian differences free energy file. + + Parameters + ---------- + filename : str + Path to free energy file to extract data from. + T : float + Temperature in Kelvin at which the simulation was sampled. + + Returns + ------- + dH/dl : Series + dH/dl as a function of step for this lambda window. + + """ + beta = 1/(k_b * T) + + state, lambdas, statevec = _extract_state(filename) + + # extract a DataFrame from free energy data + df = _extract_dataframe(filename) + + times = df[df.columns[0]] + + # want to grab only dH/dl columns + dHcols = [] + for l in lambdas: + dHcols.extend([col for col in df.columns if (l in col)]) + + dHdl = df[dHcols] + + # make dimensionless + dHdl *= beta + + dHdl = pd.DataFrame(dHdl.values, columns=lambdas, + index=pd.Float64Index(times.values, name='time')) + + # Need to modify the lambda name + cols = [l + "-lambda" for l in lambdas] + # create columns for each lambda, indicating state each row sampled from + for i, l in enumerate(cols): + dHdl[l] = statevec[i] + + # set up new multi-index + newind = ['time'] + cols + dHdl= dHdl.reset_index().set_index(newind) + + dHdl.name='dH/dl' + + return dHdl + + +def _extract_state(filename): + """Extract information on state sampled, names of lambdas. + + """ + state = None + with anyopen(filename, 'r') as f: + for line in f: + if ('#' in line) and ('State' in line): + state = int(line.split('State')[1].split(':')[0]) + # GOMC always print these two fields + lambdas = ['Coulomb', 'VDW'] + statevec = eval(line.strip().split(' = ')[-1]) + break + + return state, lambdas, statevec + + +def _extract_dataframe(filename): + """Extract a DataFrame from free energy data. + + """ + dh_col_match = "dU/dL" + h_col_match = "DelE" + pv_col_match = 'PV' + u_col_match = 'Total_En' + + xaxis = "time" + with anyopen(filename, 'r') as f: + names = [] + rows = [] + for line in f: + line = line.strip() + if len(line) == 0: + # avoid parsing empty line + continue + elif line.startswith('#T'): + # this line has state information. No need to be parsed + continue + elif line.startswith("#Steps"): + # read the headers + elements = line.split() + for i, element in enumerate(elements): + if element.startswith(u_col_match): + names.append(element) + elif element.startswith(dh_col_match): + names.append(element) + elif element.startswith(h_col_match): + names.append(element) + elif element.startswith(pv_col_match): + names.append(element) + else: + # parse line as floats + row = map(float, line.split()) + rows.append(row) + + cols = [xaxis] + cols.extend(names) + + return pd.DataFrame(rows, columns=cols) diff --git a/src/alchemlyb/tests/parsing/test_gomc.py b/src/alchemlyb/tests/parsing/test_gomc.py new file mode 100644 index 00000000..c03a0b6b --- /dev/null +++ b/src/alchemlyb/tests/parsing/test_gomc.py @@ -0,0 +1,32 @@ +"""GOMC parser tests. + +""" + +from alchemlyb.parsing.gomc import extract_dHdl, extract_u_nk +from alchemtest.gomc import load_benzene + + +def test_dHdl(): + """Test that dHdl has the correct form when extracted from files. + + """ + dataset = load_benzene() + + for filename in dataset['data']: + dHdl = extract_dHdl(filename, T=298) + + assert dHdl.index.names == ['time', 'Coulomb-lambda', 'VDW-lambda'] + assert dHdl.shape == (1000, 2) + +def test_u_nk(): + """Test that u_nk has the correct form when extracted from files. + + """ + dataset = load_benzene() + + for filename in dataset['data']: + u_nk = extract_u_nk(filename, T=298) + + assert u_nk.index.names == ['time', 'Coulomb-lambda', 'VDW-lambda'] + assert u_nk.shape == (1000, 23) + diff --git a/src/alchemlyb/tests/test_fep_estimators.py b/src/alchemlyb/tests/test_fep_estimators.py index 839d1574..f33a72fd 100644 --- a/src/alchemlyb/tests/test_fep_estimators.py +++ b/src/alchemlyb/tests/test_fep_estimators.py @@ -6,10 +6,11 @@ import numpy as np import pandas as pd -from alchemlyb.parsing import gmx, amber, namd +from alchemlyb.parsing import gmx, amber, namd, gomc from alchemlyb.estimators import MBAR, BAR import alchemtest.gmx import alchemtest.amber +import alchemtest.gomc import alchemtest.namd def gmx_benzene_coul_u_nk(): @@ -83,6 +84,14 @@ def amber_bace_example_complex_vdw(): for filename in dataset['data']['complex']['vdw']]) return u_nk +def gomc_benzene_u_nk(): + dataset = alchemtest.gomc.load_benzene() + + u_nk = pd.concat([gomc.extract_u_nk(filename, T=298) + for filename in dataset['data']]) + + return u_nk + def namd_tyr2ala(): dataset = alchemtest.namd.load_tyr2ala() u_nk1 = namd.extract_u_nk(dataset['data']['forward'][0], T=300) @@ -126,6 +135,7 @@ class TestMBAR(FEPestimatorMixin): (gmx_water_particle_with_potential_energy, -11.675, 0.083589), (gmx_water_particle_without_energy, -11.654, 0.083415), (amber_bace_example_complex_vdw, 2.40200, 0.0618453), + (gomc_benzene_u_nk, -0.79994, 0.091579), ]) def X_delta_f(self, request): get_unk, E, dE = request.param @@ -152,6 +162,7 @@ class TestBAR(FEPestimatorMixin): (gmx_water_particle_without_energy, -11.660, 0.064914), (amber_bace_example_complex_vdw, 2.37846, 0.050899), (namd_tyr2ala, 11.0044, 0.10235), + (gomc_benzene_u_nk, -0.87095, 0.071263), ]) def X_delta_f(self, request): get_unk, E, dE = request.param diff --git a/src/alchemlyb/tests/test_ti_estimators.py b/src/alchemlyb/tests/test_ti_estimators.py index 537c3d08..208e27a9 100644 --- a/src/alchemlyb/tests/test_ti_estimators.py +++ b/src/alchemlyb/tests/test_ti_estimators.py @@ -5,10 +5,11 @@ import pandas as pd -from alchemlyb.parsing import gmx, amber +from alchemlyb.parsing import gmx, amber, gomc from alchemlyb.estimators import TI import alchemtest.gmx import alchemtest.amber +import alchemtest.gomc def gmx_benzene_coul_dHdl(): @@ -91,9 +92,17 @@ def amber_simplesolvated_vdw_dHdl(): return dHdl +def gomc_benzene_dHdl(): + dataset = alchemtest.gomc.load_benzene() + + dHdl = pd.concat([gomc.extract_dHdl(filename, T=298) + for filename in dataset['data']]) + + return dHdl class TIestimatorMixin: + def test_get_delta_f(self, X_delta_f): dHdl, E, dE = X_delta_f est = self.cls().fit(dHdl) @@ -127,5 +136,3 @@ class TestTI(TIestimatorMixin): def X_delta_f(self, request): get_dHdl, E, dE = request.param return get_dHdl(), E, dE - -