Skip to content

Commit

Permalink
Add function to combine H5 dat files
Browse files Browse the repository at this point in the history
This function also handles overlapping times by taking the "latest"
data always.
  • Loading branch information
knelli2 committed Sep 19, 2024
1 parent fdb6f98 commit 7c0a988
Show file tree
Hide file tree
Showing 4 changed files with 454 additions and 1 deletion.
192 changes: 191 additions & 1 deletion src/IO/H5/CombineH5.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,31 @@

#include "IO/H5/CombineH5.hpp"

#include <boost/program_options.hpp>
#include <array>
#include <cstddef>
#include <cstdlib>
#include <iterator>
#include <optional>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

#include "DataStructures/DataVector.hpp"
#include "DataStructures/Matrix.hpp"
#include "IO/H5/AccessType.hpp"
#include "IO/H5/CheckH5PropertiesMatch.hpp"
#include "IO/H5/Dat.hpp"
#include "IO/H5/File.hpp"
#include "IO/H5/SourceArchive.hpp"
#include "IO/H5/TensorData.hpp"
#include "IO/H5/VolumeData.hpp"
#include "IO/Logging/Verbosity.hpp"
#include "Parallel/Printf/Printf.hpp"
#include "Utilities/ErrorHandling/Error.hpp"
#include "Utilities/FileSystem.hpp"
#include "Utilities/MakeString.hpp"
#include "Utilities/Numeric.hpp"
#include "Utilities/StdHelpers.hpp"

namespace {
Expand Down Expand Up @@ -149,4 +157,186 @@ void combine_h5_vol(const std::vector<std::string>& file_names,
new_file.close_current_object();
}
}

// Combine the `h5::Dat` subfiles of every H5 file in `h5_files_to_combine`
// into the single H5 file `output_h5_filename`.
//
// - The names of the dat subfiles are taken from the first H5 file only; every
//   other H5 file is expected to contain the same dat subfiles.
// - Column 0 of each dat subfile is treated as the time. Files later in
//   `h5_files_to_combine` take precedence: any row of an earlier file whose
//   time is >= the first time of the next later file that contributes data is
//   dropped. Times within each file are assumed to already be sorted.
// - If only one H5 file is given it is copied to `output_h5_filename` as-is.
// - Errors if there are no H5 files, if the first H5 file has no dat subfiles,
//   if a dat subfile has no rows, or if the legend/version of a dat subfile
//   differs between H5 files.
// - `verbosity` >= Quiet prints progress messages; >= Verbose additionally
//   prints how many rows are taken from each H5 file for each dat subfile.
void combine_h5_dat(const std::vector<std::string>& h5_files_to_combine,
                    const std::string& output_h5_filename,
                    const Verbosity verbosity) {
  if (h5_files_to_combine.empty()) {
    ERROR_NO_TRACE("No H5 files to combine!");
  }

  // The first H5 file defines which dat subfiles get combined.
  std::vector<std::string> subfile_dat_names{};
  {
    const h5::H5File<h5::AccessType::ReadOnly> file_to_combine{
        h5_files_to_combine[0]};
    subfile_dat_names = file_to_combine.all_files<h5::Dat>("/");

    if (subfile_dat_names.empty()) {
      ERROR_NO_TRACE("No dat files in H5 file " << h5_files_to_combine[0]
                                                << " to combine!");
    }
  }

  if (verbosity >= Verbosity::Quiet) {
    Parallel::printf("Combining all dat files from %s into %s\n",
                     h5_files_to_combine, output_h5_filename);
  }

  // We just copy if there's only 1 file. We could change this behavior to
  // error, but that's less versatile when using globs
  if (h5_files_to_combine.size() == 1) {
    file_system::copy(h5_files_to_combine[0], output_h5_filename);
    if (verbosity >= Verbosity::Quiet) {
      Parallel::printf("Done!\n");
    }
    return;
  }

  // For each dat subfile, this holds the number of times (rows) to take from
  // each of the H5 files to combine so that we always use the latest times. A
  // nullopt means no times from that H5 file will be used. Each std::vector
  // has the same length as the number of H5 files to combine.
  std::unordered_map<std::string, std::vector<std::optional<size_t>>>
      num_time_map{};

  // The outer loop is over dat files because we don't require different dat
  // files to have the same times
  for (const std::string& dat_filename : subfile_dat_names) {
    std::vector<std::optional<size_t>>& num_times_per_file =
        num_time_map[dat_filename];
    num_times_per_file.resize(h5_files_to_combine.size());

    // The legend and version are sanity checks
    std::optional<std::vector<std::string>> legend{};
    std::optional<uint32_t> version{};
    // Nullopt just means we are at the first file we are looping over (which
    // is the last file in the list)
    std::optional<double> earliest_time{};
    // We loop backwards to always ensure the "latest" time is used. Counting
    // down with an unsigned index: `index-- > 0` tests before decrementing, so
    // the body sees size-1, ..., 0.
    for (size_t index = h5_files_to_combine.size(); index-- > 0;) {
      const std::string& filename = h5_files_to_combine[index];
      const h5::H5File<h5::AccessType::ReadOnly> file_to_combine{filename};
      const auto& dat_file = file_to_combine.get<h5::Dat>(dat_filename);
      const auto dimensions = dat_file.get_dimensions();
      if (not legend.has_value()) {
        legend = dat_file.get_legend();
        version = dat_file.get_version();
      }

      // Only grab the times (column 0) for now
      const Matrix times = dat_file.get_data_subset({0}, 0, dimensions[0]);

      // Requiring at least one time makes things easier below.
      if (UNLIKELY(times.rows() == 0)) {
        ERROR_NO_TRACE("No times in dat file " << dat_filename << " in H5 file "
                                               << filename);
      }
      if (UNLIKELY(legend.value() != dat_file.get_legend())) {
        ERROR_NO_TRACE("Legend of dat file "
                       << dat_filename << " in H5 file " << filename
                       << " doesn't match other H5 files.");
      }
      if (UNLIKELY(version.value() != dat_file.get_version())) {
        ERROR_NO_TRACE("Version of dat file "
                       << dat_filename << " in H5 file " << filename
                       << " doesn't match other H5 files.");
      }

      if (not earliest_time.has_value()) {
        // This is the first file we visit (the last in the list). We can't
        // make any decisions here so just keep all of its times and remember
        // its earliest time.
        num_times_per_file[index] = dimensions[0];
        earliest_time = times(0, 0);
      } else {
        // Drop every trailing time of this file that is at or after the
        // earliest time of the previously visited file (previous = later in
        // the sequence of files since we are looping backward). What remains
        // is the number of leading rows to keep.
        size_t num_times_to_keep = times.rows();
        while (num_times_to_keep > 0 and
               times(num_times_to_keep - 1, 0) >= earliest_time.value()) {
          --num_times_to_keep;
        }

        // So long as this file contains some times that need to be combined,
        // store the number of times and the first time in this file.
        // Otherwise the entry stays nullopt and the earliest time is
        // unchanged.
        if (num_times_to_keep > 0) {
          num_times_per_file[index] = num_times_to_keep;
          earliest_time = times(0, 0);
        }
      }

      file_to_combine.close_current_object();
    }
  }

  if (verbosity >= Verbosity::Verbose) {
    std::stringstream ss{};
    ss << "Number of times selected to combine in each H5 file for each dat "
          "subfile:\n";
    // Iterate the subfile names rather than the unordered_map so the report
    // has a reproducible order.
    for (const std::string& subfile_name : subfile_dat_names) {
      const std::vector<std::optional<size_t>>& h5_files_num_times =
          num_time_map.at(subfile_name);
      ss << " Dat Subfile " << subfile_name << ":\n";
      for (size_t i = 0; i < h5_files_num_times.size(); i++) {
        ss << "  H5 File " << h5_files_to_combine[i] << ": "
           << h5_files_num_times[i].value_or(0) << "\n";
      }
    }

    Parallel::printf("%s", ss.str());
  }

  // Now that we know the number of times for each dat file for each H5 file to
  // combine, we open the output file and combine everything
  h5::H5File<h5::AccessType::ReadWrite> output_h5_file{output_h5_filename};

  // Returns {0, 1, ..., size - 1}, i.e. "all columns" of a dat file
  const auto make_these_columns = [](const size_t size) -> std::vector<size_t> {
    std::vector<size_t> these_columns{};
    these_columns.resize(size);
    alg::iota(these_columns, 0_st);
    return these_columns;
  };

  // Now we loop over H5 files first to avoid unnecessary filesystem access.
  // Going forward through the files appends the data in increasing time order.
  for (size_t i = 0; i < h5_files_to_combine.size(); i++) {
    const std::string& filename = h5_files_to_combine[i];
    const h5::H5File<h5::AccessType::ReadOnly> file_to_combine{filename};
    for (const std::string& dat_filename : subfile_dat_names) {
      const auto& input_dat_file = file_to_combine.get<h5::Dat>(dat_filename);
      const std::vector<std::string>& legend = input_dat_file.get_legend();

      // Create the output dat file the first time we see it (even if this H5
      // file contributes no rows), otherwise reuse it. Avoid copying the
      // legend around if we don't have to.
      h5::Dat* output_dat_file = nullptr;
      if (output_h5_file.exists<h5::Dat>(dat_filename)) {
        output_dat_file = &output_h5_file.get<h5::Dat>(dat_filename);
      } else {
        const uint32_t version = input_dat_file.get_version();
        output_dat_file =
            &output_h5_file.insert<h5::Dat>(dat_filename, legend, version);
      }

      const std::optional<size_t>& num_times = num_time_map.at(dat_filename)[i];

      // Only append data if we include data from this file
      if (num_times.has_value()) {
        // Always start with row 0
        const Matrix data_to_append = input_dat_file.get_data_subset(
            make_these_columns(legend.size()), 0, num_times.value());
        output_dat_file->append(data_to_append);
      }

      file_to_combine.close_current_object();
      output_h5_file.close_current_object();
    }
  }

  if (verbosity >= Verbosity::Quiet) {
    Parallel::printf("Done!\n");
  }
}
} // namespace h5
36 changes: 36 additions & 0 deletions src/IO/H5/CombineH5.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,44 @@
#include <string>
#include <vector>

#include "IO/Logging/Verbosity.hpp"

namespace h5 {
// NOTE(review): the definition of combine_h5_vol is not fully visible in this
// diff. Judging only by its name and parameters it presumably combines the
// volume subfile `subfile_name` of each file in `file_names` into `output` --
// confirm against the implementation before relying on this.
void combine_h5_vol(const std::vector<std::string>& file_names,
const std::string& subfile_name, const std::string& output,
bool check_src = true);

/*!
* \brief Combine the `h5::Dat` subfiles of multiple `h5::H5File`s into a single
* H5 file.
*
* \details The first column of each `h5::Dat` subfile is treated as the time.
* If there are overlapping times, the "latest" one is always used, where files
* later in \p h5_files_to_combine take precedence over earlier ones;
* meaning if you have data in `File1.h5` and `File2.h5` and if the first time
* in `File2.h5` is before some times in `File1.h5`, those times in `File1.h5`
* will be discarded and won't appear in the combined H5 file.
*
* If the H5 files in \p h5_files_to_combine have other types of subfiles, those
* will be ignored and will not appear in \p output_h5_filename. This function
* also assumes that the times in each of the \p h5_files_to_combine are already
* sorted.
*
* The names of the `h5::Dat` subfiles to combine are read from the first file
* in \p h5_files_to_combine.
*
* If \p h5_files_to_combine contains exactly one file, that file is copied to
* \p output_h5_filename with `file_system::copy` and nothing is combined.
*
* If \p h5_files_to_combine is empty, an error will occur.
*
* If there are no `h5::Dat` files in the first of the \p h5_files_to_combine,
* an error will occur.
*
* If an `h5::Dat` subfile in any of the \p h5_files_to_combine contains no
* rows, an error will occur.
*
* If the legend or version of an `h5::Dat` is not the same in all of
* \p h5_files_to_combine, an error will occur.
*
* \param h5_files_to_combine Vector of H5 files to combine. They must all have
* the same `h5::Dat` filenames, and those `h5::Dat` subfiles must have the same
* legends and versions. If not, an error will occur.
* \param output_h5_filename Name of the combined H5 file. The `h5::Dat` subfile
* structure will be identical to the ones in \p h5_files_to_combine.
* \param verbosity Controls how much is printed to stdout. Defaults to
* `Verbosity::Silent`, i.e. no output. Progress messages are printed for
* `Verbosity::Quiet` and above, and the number of times taken from each H5
* file for each `h5::Dat` subfile is additionally printed for
* `Verbosity::Verbose` and above.
*/
void combine_h5_dat(const std::vector<std::string>& h5_files_to_combine,
const std::string& output_h5_filename,
Verbosity verbosity = Verbosity::Silent);
} // namespace h5
1 change: 1 addition & 0 deletions tests/Unit/IO/H5/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ set(LIBRARY "Test_H5")
set(LIBRARY_SOURCES
Test_Cce.cpp
Test_CheckH5PropertiesMatch.cpp
Test_CombineH5.cpp
Test_Dat.cpp
Test_EosTable.cpp
Test_H5.cpp
Expand Down
Loading

0 comments on commit 7c0a988

Please sign in to comment.