From 7c0a98808c1a38bb49fc7a61b83cd6c504b40a00 Mon Sep 17 00:00:00 2001 From: Kyle Nelli Date: Wed, 18 Sep 2024 16:21:09 -0700 Subject: [PATCH] Add function to combine H5 dat files This function also handles overlapping times by taking the "latest" data always. --- src/IO/H5/CombineH5.cpp | 192 ++++++++++++++++++++++- src/IO/H5/CombineH5.hpp | 36 +++++ tests/Unit/IO/H5/CMakeLists.txt | 1 + tests/Unit/IO/H5/Test_CombineH5.cpp | 226 ++++++++++++++++++++++++++++ 4 files changed, 454 insertions(+), 1 deletion(-) create mode 100644 tests/Unit/IO/H5/Test_CombineH5.cpp diff --git a/src/IO/H5/CombineH5.cpp b/src/IO/H5/CombineH5.cpp index cbfdd3293112..67ccc9c8378d 100644 --- a/src/IO/H5/CombineH5.cpp +++ b/src/IO/H5/CombineH5.cpp @@ -3,23 +3,31 @@ #include "IO/H5/CombineH5.hpp" -#include +#include #include #include #include +#include +#include #include +#include #include #include "DataStructures/DataVector.hpp" +#include "DataStructures/Matrix.hpp" #include "IO/H5/AccessType.hpp" #include "IO/H5/CheckH5PropertiesMatch.hpp" +#include "IO/H5/Dat.hpp" #include "IO/H5/File.hpp" #include "IO/H5/SourceArchive.hpp" #include "IO/H5/TensorData.hpp" #include "IO/H5/VolumeData.hpp" +#include "IO/Logging/Verbosity.hpp" #include "Parallel/Printf/Printf.hpp" +#include "Utilities/ErrorHandling/Error.hpp" #include "Utilities/FileSystem.hpp" #include "Utilities/MakeString.hpp" +#include "Utilities/Numeric.hpp" #include "Utilities/StdHelpers.hpp" namespace { @@ -149,4 +157,186 @@ void combine_h5_vol(const std::vector& file_names, new_file.close_current_object(); } } + +void combine_h5_dat(const std::vector& h5_files_to_combine, + const std::string& output_h5_filename, + const Verbosity verbosity) { + if (h5_files_to_combine.empty()) { + ERROR_NO_TRACE("No H5 files to combine!"); + } + + std::vector subfile_dat_names{}; + { + const h5::H5File file_to_combine{ + h5_files_to_combine[0]}; + subfile_dat_names = file_to_combine.all_files("/"); + + if (subfile_dat_names.empty()) { + 
ERROR_NO_TRACE("No dat files in H5 file " << h5_files_to_combine[0] + << "to combine!"); + } + } + + if (verbosity >= Verbosity::Quiet) { + Parallel::printf("Combining all dat files from %s into %s\n", + h5_files_to_combine, output_h5_filename); + } + + // We just copy if there's only 1 file. We could change this behavior to + // error, but that's less versatile when using globs + if (h5_files_to_combine.size() == 1) { + file_system::copy(h5_files_to_combine[0], output_h5_filename); + if (verbosity >= Verbosity::Quiet) { + Parallel::printf("Done!\n"); + } + return; + } + + // For each dat subfile, this holds the number of times from each of the H5 + // files to combine so that we always use the latest times. A nullopt means no + // times from that H5 file will be used. The std::vector should be the same + // length as the number of H5 files to combine + std::unordered_map>> + num_time_map{}; + + // The outer loop is over dat files because we don't require different dat + // files to have the same times + for (const std::string& dat_filename : subfile_dat_names) { + num_time_map[dat_filename]; + num_time_map.at(dat_filename).resize(h5_files_to_combine.size()); + + // The legend and version are sanity checks + std::optional> legend{}; + std::optional version{}; + // Nullopt just means the first file we are looping over + std::optional earliest_time{}; + // We loop backwards to always ensure the "latest" time is used. 
+ for (int i = static_cast(h5_files_to_combine.size()) - 1; i >= 0; + i--) { + const auto index = static_cast(i); + const std::string& filename = h5_files_to_combine[index]; + const h5::H5File file_to_combine{filename}; + const auto& dat_file = file_to_combine.get(dat_filename); + const auto dimensions = dat_file.get_dimensions(); + if (not legend.has_value()) { + legend = dat_file.get_legend(); + version = dat_file.get_version(); + } + + // Only grab the times for now + const Matrix times = dat_file.get_data_subset({0}, 0, dimensions[0]); + + // Makes things easier below. + if (UNLIKELY(times.rows() == 0)) { + ERROR_NO_TRACE("No times in dat file " << dat_filename << " in H5 file " + << filename); + } + if (UNLIKELY(legend.value() != dat_file.get_legend())) { + ERROR_NO_TRACE("Legend of dat file " + << dat_filename << " in H5 file " << filename + << " doesn't match other H5 files."); + } + if (UNLIKELY(version.value() != dat_file.get_version())) { + ERROR_NO_TRACE("Version of dat file " + << dat_filename << " in H5 file " << filename + << " doesn't match other H5 files."); + } + + // This is the first file. We can't make any decisions here so just store + // the number of times and the earliest time of this file + if (not earliest_time.has_value()) { + num_time_map.at(dat_filename)[index] = dimensions[0]; + earliest_time = times(0, 0); + continue; + } + + // Determine if the earliest time of the previous file (previous = later + // in the sequence of files since we are looping backward) is before + // any of the times in this file. If so, don't include those times. + std::optional row = times.rows() - 1; + while (times(row.value(), 0) >= earliest_time.value()) { + // Once we get to the first time in this file, if it's still after the + // earliest time in the previous file, then we don't include any times + // from this file. 
+ if (row.value() == 0) { + row.reset(); + break; + } + row.value()--; + } + + // So long as this file contains some times that need to be combined, + // store the number of times and the first time in this file. + if (row.has_value()) { + num_time_map.at(dat_filename)[index] = row.value() + 1; + earliest_time = times(0, 0); + } + + file_to_combine.close_current_object(); + } + } + + if (verbosity >= Verbosity::Verbose) { + std::stringstream ss{}; + ss << "Number of times selected to combine in each H5 file for each dat " + "subfile:\n"; + for (const auto& [subfile_name, h5_files_num_times] : num_time_map) { + ss << " Dat Subfile " << subfile_name << ":\n"; + for (size_t i = 0; i < h5_files_num_times.size(); i++) { + ss << " H5 File " << h5_files_to_combine[i] << ": " + << h5_files_num_times[i].value_or(0) << "\n"; + } + } + + Parallel::printf("%s", ss.str()); + } + + // Now that we know the time indices for each dat file for each H5 file to + // combine, we open the output file and combine everything + h5::H5File output_h5_file{output_h5_filename}; + + const auto make_these_columns = [](const size_t size) -> std::vector { + std::vector these_columns{}; + these_columns.resize(size); + alg::iota(these_columns, 0_st); + return these_columns; + }; + + // Now we loop over H5 files first to avoid unnecessary filesystem access + for (size_t i = 0; i < h5_files_to_combine.size(); i++) { + const std::string& filename = h5_files_to_combine[i]; + const h5::H5File file_to_combine{filename}; + for (const std::string& dat_filename : subfile_dat_names) { + const auto& input_dat_file = file_to_combine.get(dat_filename); + const std::vector& legend = input_dat_file.get_legend(); + + // Avoid copying the legend around if we don't have to + h5::Dat* output_dat_file = nullptr; + if (output_h5_file.exists(dat_filename)) { + output_dat_file = &output_h5_file.get(dat_filename); + } else { + const uint32_t version = input_dat_file.get_version(); + output_dat_file = + 
&output_h5_file.insert(dat_filename, legend, version); + } + + const std::optional& num_times = num_time_map.at(dat_filename)[i]; + + // Only append data if we include data from this file + if (num_times.has_value()) { + // Always start with row 0 + const Matrix data_to_append = input_dat_file.get_data_subset( + make_these_columns(legend.size()), 0, num_times.value()); + output_dat_file->append(data_to_append); + } + + file_to_combine.close_current_object(); + output_h5_file.close_current_object(); + } + } + + if (verbosity >= Verbosity::Quiet) { + Parallel::printf("Done!\n"); + } +} } // namespace h5 diff --git a/src/IO/H5/CombineH5.hpp b/src/IO/H5/CombineH5.hpp index 85e1e6504e51..0e28d660acc6 100644 --- a/src/IO/H5/CombineH5.hpp +++ b/src/IO/H5/CombineH5.hpp @@ -6,8 +6,44 @@ #include #include +#include "IO/Logging/Verbosity.hpp" + namespace h5 { void combine_h5_vol(const std::vector& file_names, const std::string& subfile_name, const std::string& output, bool check_src = true); + +/*! + * \brief Combine the `h5::Dat` subfiles of multiple `h5::H5File`s into a single + * H5 file. + * + * \details If there are overlapping times, the "latest" one is always used; + * meaning if you have data in `File1.h5` and `File2.h5` and if the first time + * in `File2.h5` is before some times in `File1.h5`, those times in `File1.h5` + * will be discarded and won't appear in the combined H5 file. + * + * If the H5 files in \p h5_files_to_combine have other types of subfiles, those + * will be ignored and will not appear in \p output_h5_filename. This function + * also assumes that the times in each of the \p h5_files_to_combine are already + * sorted. + * + * If \p h5_files_to_combine is empty, an error will occur. + * + * If there are no `h5::Dat` files in the \p h5_files_to_combine, an error will + * occur. + * + * If the legend or version of an `h5::Dat` is not the same in all of + * \p h5_files_to_combine, an error will occur. 
+ * + * \param h5_files_to_combine Vector of H5 files to combine. They must all have + * the same `h5::Dat` filenames, and those `h5::Dat` subfiles must have the same + * legends and versions. If not, an error will occur. + * \param output_h5_filename Name of the combined H5 file. The `h5::Dat` subfile + * structure will be identical to the ones in \p h5_files_to_combine. + * \param verbosity Controls how much is printed to stdout. Defaults to + * `Verbosity::Silent`, i.e. no output. + */ +void combine_h5_dat(const std::vector& h5_files_to_combine, + const std::string& output_h5_filename, + Verbosity verbosity = Verbosity::Silent); } // namespace h5 diff --git a/tests/Unit/IO/H5/CMakeLists.txt b/tests/Unit/IO/H5/CMakeLists.txt index 45282f8c47d6..1202ee009815 100644 --- a/tests/Unit/IO/H5/CMakeLists.txt +++ b/tests/Unit/IO/H5/CMakeLists.txt @@ -6,6 +6,7 @@ set(LIBRARY "Test_H5") set(LIBRARY_SOURCES Test_Cce.cpp Test_CheckH5PropertiesMatch.cpp + Test_CombineH5.cpp Test_Dat.cpp Test_EosTable.cpp Test_H5.cpp diff --git a/tests/Unit/IO/H5/Test_CombineH5.cpp b/tests/Unit/IO/H5/Test_CombineH5.cpp new file mode 100644 index 000000000000..087bd23f224c --- /dev/null +++ b/tests/Unit/IO/H5/Test_CombineH5.cpp @@ -0,0 +1,226 @@ +// Distributed under the MIT License. +// See LICENSE.txt for details. 
+ +#include "Framework/TestingFramework.hpp" + +#include +#include + +#include "DataStructures/Matrix.hpp" +#include "IO/H5/AccessType.hpp" +#include "IO/H5/Cce.hpp" +#include "IO/H5/CombineH5.hpp" +#include "IO/H5/Dat.hpp" +#include "IO/H5/File.hpp" +#include "IO/Logging/Verbosity.hpp" +#include "Utilities/FileSystem.hpp" + +namespace { +void test_single_file() { + const std::string combined_filename{"CombinedSingle.h5"}; + const std::string original_filename{"OriginalFileName.h5"}; + if (file_system::check_if_file_exists(combined_filename)) { + file_system::rm(combined_filename, true); + } + if (file_system::check_if_file_exists(original_filename)) { + file_system::rm(original_filename, true); + } + + const std::string subfile_name{"SubfileName"}; + const Matrix data{{0.0, 0.0}, {1.0, 1.0}, {2.0, 2.0}, {3.0, 3.0}}; + const std::vector legend{"Time", "Data"}; + const uint32_t version = 4; + + { + h5::H5File h5_file{original_filename}; + auto& dat_file = h5_file.insert(subfile_name, legend, version); + dat_file.append(data); + } + + h5::combine_h5_dat({original_filename}, combined_filename, Verbosity::Quiet); + + { + const h5::H5File h5_file{combined_filename}; + const auto& dat_file = h5_file.get(subfile_name); + CHECK(dat_file.get_version() == version); + CHECK(dat_file.get_legend() == legend); + CHECK(dat_file.get_data() == data); + } +} + +void test() { + const std::string combined_filename{"MightyMorphinPowerRangers.h5"}; + if (file_system::check_if_file_exists(combined_filename)) { + file_system::rm(combined_filename, true); + } + const std::vector individual_filenames{ + "RedRanger.h5", "BlackRanger.h5", "BlueRanger.h5", "YellowRanger.h5", + "PinkRanger.h5"}; + const std::vector subfile_names{"Subfile1", "Subfile2"}; + // All subfiles can just share the same legend. The data doesn't matter, only + // the times for this test. 
+ const std::vector legend{"Time", "Data"}; + const std::vector>> data{ + // This file doesn't keep the last time + {{0.0, 0.0}, {1.0, 1.0}, {2.0, 0.0}}, + // This file keeps all times + {{1.5, 2.0}, {2.1, 3.0}}, + // This file is completely discarded because the next file's first time is + // the same as this file's first time + {{3.0, 0.0}, {4.0, 0.0}, {5.0, 0.0}}, + // This file only keeps the first time + {{3.0, 4.0}, {4.1, 0.0}, {4.5, 0.0}}, + // This file keeps all times + {{4.0, 5.0}, {5.5, 6.0}, {6.0, 7.0}}}; + + const Matrix expected_data{{0.0, 0.0}, {1.0, 1.0}, {1.5, 2.0}, {2.1, 3.0}, + {3.0, 4.0}, {4.0, 5.0}, {5.5, 6.0}, {6.0, 7.0}}; + + // Write the individual files + { + for (size_t i = 0; i < individual_filenames.size(); i++) { + const std::string& filename = individual_filenames[i]; + if (file_system::check_if_file_exists(filename)) { + file_system::rm(filename, true); + } + h5::H5File h5_file{filename}; + for (const std::string& subfile_name : subfile_names) { + auto& dat_file = h5_file.insert(subfile_name, legend); + dat_file.append(data[i]); + h5_file.close_current_object(); + } + } + } + + // Combine the H5 files + h5::combine_h5_dat(individual_filenames, combined_filename, + Verbosity::Verbose); + + REQUIRE(file_system::check_if_file_exists(combined_filename)); + + { + const h5::H5File h5_file{combined_filename}; + for (const std::string& subfile_name : subfile_names) { + CAPTURE(subfile_name); + const auto& dat_file = h5_file.get(subfile_name); + const Matrix dat_data = dat_file.get_data(); + CHECK(expected_data == dat_data); + h5_file.close_current_object(); + } + } + + if (file_system::check_if_file_exists(combined_filename)) { + file_system::rm(combined_filename, true); + } + + for (const std::string& filename : individual_filenames) { + if (file_system::check_if_file_exists(filename)) { + file_system::rm(filename, true); + } + } +} + +void test_errors() { + const std::string fake_file{"FakeFile.h5"}; + const std::string 
error_filename_1{"CombineH5Error1.h5"}; + const std::string error_filename_2{"CombineH5Error2.h5"}; + CHECK_THROWS_WITH( + h5::combine_h5_dat({}, fake_file), + Catch::Matchers::ContainsSubstring("No H5 files to combine!")); + + const auto delete_files = [&]() { + if (file_system::check_if_file_exists(error_filename_1)) { + file_system::rm(error_filename_1, true); + } + if (file_system::check_if_file_exists(error_filename_2)) { + file_system::rm(error_filename_2, true); + } + if (file_system::check_if_file_exists(fake_file)) { + file_system::rm(fake_file, true); + } + }; + + delete_files(); + + { + INFO("No dat files"); + { + h5::H5File h5_file{error_filename_1}; + h5_file.insert("CceSubfile", 4); + } + CHECK_THROWS_WITH( + h5::combine_h5_dat({error_filename_1}, fake_file), + Catch::Matchers::ContainsSubstring("No dat files in H5 file")); + } + + { + INFO("No times in dat file"); + { + h5::H5File h5_file{error_filename_1, true}; + h5_file.insert("DatSubfile", + std::vector{"Time", "Blah"}); + } + { + h5::H5File h5_file{error_filename_2, true}; + h5_file.insert("DatSubfile", + std::vector{"Time", "Blah"}); + } + CHECK_THROWS_WITH( + h5::combine_h5_dat({error_filename_1, error_filename_2}, fake_file), + Catch::Matchers::ContainsSubstring("No times in dat file")); + } + + delete_files(); + + { + INFO("Legends don't match"); + { + h5::H5File h5_file{error_filename_1, true}; + auto& dat_file = h5_file.try_insert( + "DatSubfile", std::vector{"Time", "Blah"}); + dat_file.append(std::vector{0.0, 0.0}); + } + { + h5::H5File h5_file{error_filename_2, true}; + auto& dat_file = h5_file.try_insert( + "DatSubfile", std::vector{"Time", "DifferentBlah"}); + dat_file.append(std::vector{0.0, 0.0}); + } + CHECK_THROWS_WITH( + h5::combine_h5_dat({error_filename_1, error_filename_2}, fake_file), + Catch::Matchers::ContainsSubstring("Legend of dat file") and + Catch::Matchers::ContainsSubstring("doesn't match other H5 files")); + } + + delete_files(); + + { + INFO("Versions don't 
match"); + { + h5::H5File h5_file{error_filename_1, true}; + auto& dat_file = h5_file.try_insert( + "DatSubfile", std::vector{"Time", "Blah"}, 0); + dat_file.append(std::vector{0.0, 0.0}); + } + { + h5::H5File h5_file{error_filename_2, true}; + auto& dat_file = h5_file.try_insert( + "DatSubfile", std::vector{"Time", "Blah"}, 1); + dat_file.append(std::vector{0.0, 0.0}); + } + CHECK_THROWS_WITH( + h5::combine_h5_dat({error_filename_1, error_filename_2}, fake_file), + Catch::Matchers::ContainsSubstring("Version of dat file") and + Catch::Matchers::ContainsSubstring("doesn't match other H5 files")); + } + + delete_files(); +} +} // namespace + +// [TimeOut, 15] +SPECTRE_TEST_CASE("Unit.IO.H5.CombineH5", "[Unit][IO][H5]") { + test_single_file(); + test_errors(); + test(); +}