Add function to combine H5 dat files

This function also handles overlapping times by taking the "latest" data always.
sxs-collaboration · Sep 19, 2024 · 7c0a988 · 7c0a988
1 parent fdb6f98
commit 7c0a988
Show file tree

Hide file tree

Showing 4 changed files with 454 additions and 1 deletion.
diff --git a/src/IO/H5/CombineH5.cpp b/src/IO/H5/CombineH5.cpp
@@ -3,23 +3,31 @@
 
 #include "IO/H5/CombineH5.hpp"
 
-#include <boost/program_options.hpp>
+#include <array>
 #include <cstddef>
 #include <cstdlib>
 #include <iterator>
+#include <optional>
+#include <sstream>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 #include "DataStructures/DataVector.hpp"
+#include "DataStructures/Matrix.hpp"
 #include "IO/H5/AccessType.hpp"
 #include "IO/H5/CheckH5PropertiesMatch.hpp"
+#include "IO/H5/Dat.hpp"
 #include "IO/H5/File.hpp"
 #include "IO/H5/SourceArchive.hpp"
 #include "IO/H5/TensorData.hpp"
 #include "IO/H5/VolumeData.hpp"
+#include "IO/Logging/Verbosity.hpp"
 #include "Parallel/Printf/Printf.hpp"
+#include "Utilities/ErrorHandling/Error.hpp"
 #include "Utilities/FileSystem.hpp"
 #include "Utilities/MakeString.hpp"
+#include "Utilities/Numeric.hpp"
 #include "Utilities/StdHelpers.hpp"
 
 namespace {
@@ -149,4 +157,186 @@ void combine_h5_vol(const std::vector<std::string>& file_names,
     new_file.close_current_object();
   }
 }
+
+// Combines the `h5::Dat` subfiles of several H5 files into one output H5 file.
+//
+// The work is done in two passes:
+//  1. For every dat subfile (names are taken from the first input file), walk
+//     the input files from last to first and record how many leading rows of
+//     each file to keep, so that overlapping times always come from the later
+//     file.
+//  2. Walk the input files in order and append the selected rows of every dat
+//     subfile to the output file.
+//
+// Calls ERROR_NO_TRACE if there are no input files, if the first input file
+// has no dat subfiles, if a dat subfile has no rows, or if the legend or
+// version of a dat subfile differs between input files.
+void combine_h5_dat(const std::vector<std::string>& h5_files_to_combine,
+                    const std::string& output_h5_filename,
+                    const Verbosity verbosity) {
+  if (h5_files_to_combine.empty()) {
+    ERROR_NO_TRACE("No H5 files to combine!");
+  }
+
+  // The list of dat subfiles is taken from the first H5 file only. The scoped
+  // block closes that file again before anything else is opened.
+  std::vector<std::string> subfile_dat_names{};
+  {
+    const h5::H5File<h5::AccessType::ReadOnly> file_to_combine{
+        h5_files_to_combine[0]};
+    subfile_dat_names = file_to_combine.all_files<h5::Dat>("/");
+
+    if (subfile_dat_names.empty()) {
+      ERROR_NO_TRACE("No dat files in H5 file " << h5_files_to_combine[0]
+                                                << " to combine!");
+    }
+  }
+
+  if (verbosity >= Verbosity::Quiet) {
+    Parallel::printf("Combining all dat files from %s into %s\n",
+                     h5_files_to_combine, output_h5_filename);
+  }
+
+  // We just copy if there's only 1 file. We could change this behavior to
+  // error, but that's less versatile when using globs
+  if (h5_files_to_combine.size() == 1) {
+    file_system::copy(h5_files_to_combine[0], output_h5_filename);
+    if (verbosity >= Verbosity::Quiet) {
+      Parallel::printf("Done!\n");
+    }
+    return;
+  }
+
+  // For each dat subfile, this holds the number of times from each of the H5
+  // files to combine so that we always use the latest times. A nullopt means no
+  // times from that H5 file will be used. The std::vector should be the same
+  // length as the number of H5 files to combine
+  std::unordered_map<std::string, std::vector<std::optional<size_t>>>
+      num_time_map{};
+
+  // The outer loop is over dat files because we don't require different dat
+  // files to have the same times
+  for (const std::string& dat_filename : subfile_dat_names) {
+    // operator[] creates the entry; bind it once instead of looking it up again
+    // for every access below. Every slot starts out as nullopt.
+    std::vector<std::optional<size_t>>& num_times_per_file =
+        num_time_map[dat_filename];
+    num_times_per_file.resize(h5_files_to_combine.size());
+
+    // The legend and version are sanity checks
+    std::optional<std::vector<std::string>> legend{};
+    std::optional<uint32_t> version{};
+    // Nullopt just means the first file we are looping over
+    std::optional<double> earliest_time{};
+    // We loop backwards to always ensure the "latest" time is used.
+    for (int i = static_cast<int>(h5_files_to_combine.size()) - 1; i >= 0;
+         i--) {
+      const auto index = static_cast<size_t>(i);
+      const std::string& filename = h5_files_to_combine[index];
+      const h5::H5File<h5::AccessType::ReadOnly> file_to_combine{filename};
+      const auto& dat_file = file_to_combine.get<h5::Dat>(dat_filename);
+      const auto dimensions = dat_file.get_dimensions();
+      // The first file visited (the last one in the list) provides the
+      // reference legend and version that all other files are checked against.
+      if (not legend.has_value()) {
+        legend = dat_file.get_legend();
+        version = dat_file.get_version();
+      }
+
+      // Only grab the times for now
+      const Matrix times = dat_file.get_data_subset({0}, 0, dimensions[0]);
+
+      // Makes things easier below.
+      if (UNLIKELY(times.rows() == 0)) {
+        ERROR_NO_TRACE("No times in dat file " << dat_filename << " in H5 file "
+                                               << filename);
+      }
+      if (UNLIKELY(legend.value() != dat_file.get_legend())) {
+        ERROR_NO_TRACE("Legend of dat file "
+                       << dat_filename << " in H5 file " << filename
+                       << " doesn't match other H5 files.");
+      }
+      if (UNLIKELY(version.value() != dat_file.get_version())) {
+        ERROR_NO_TRACE("Version of dat file "
+                       << dat_filename << " in H5 file " << filename
+                       << " doesn't match other H5 files.");
+      }
+
+      // This is the first file. We can't make any decisions here so just store
+      // the number of times and the earliest time of this file
+      if (not earliest_time.has_value()) {
+        num_times_per_file[index] = dimensions[0];
+        earliest_time = times(0, 0);
+        continue;
+      }
+
+      // Determine if the earliest time of the previous file (previous = later
+      // in the sequence of files since we are looping backward) is before
+      // any of the times in this file. If so, don't include those times.
+      std::optional<size_t> row = times.rows() - 1;
+      while (times(row.value(), 0) >= earliest_time.value()) {
+        // Once we get to the first time in this file, if it's still after the
+        // earliest time in the previous file, then we don't include any times
+        // from this file.
+        if (row.value() == 0) {
+          row.reset();
+          break;
+        }
+        row.value()--;
+      }
+
+      // So long as this file contains some times that need to be combined,
+      // store the number of times and the first time in this file. If nothing
+      // from this file is kept, earliest_time is deliberately left unchanged so
+      // that earlier files are still compared against the data actually used.
+      if (row.has_value()) {
+        num_times_per_file[index] = row.value() + 1;
+        earliest_time = times(0, 0);
+      }
+
+      file_to_combine.close_current_object();
+    }
+  }
+
+  if (verbosity >= Verbosity::Verbose) {
+    std::stringstream ss{};
+    ss << "Number of times selected to combine in each H5 file for each dat "
+          "subfile:\n";
+    for (const auto& [subfile_name, h5_files_num_times] : num_time_map) {
+      ss << " Dat Subfile " << subfile_name << ":\n";
+      for (size_t i = 0; i < h5_files_num_times.size(); i++) {
+        ss << "  H5 File " << h5_files_to_combine[i] << ": "
+           << h5_files_num_times[i].value_or(0) << "\n";
+      }
+    }
+
+    Parallel::printf("%s", ss.str());
+  }
+
+  // Now that we know the time indices for each dat file for each H5 file to
+  // combine, we open the output file and combine everything
+  h5::H5File<h5::AccessType::ReadWrite> output_h5_file{output_h5_filename};
+
+  // Builds the column index list {0, 1, ..., size - 1}, i.e. "all columns"
+  const auto make_these_columns = [](const size_t size) -> std::vector<size_t> {
+    std::vector<size_t> these_columns{};
+    these_columns.resize(size);
+    alg::iota(these_columns, 0_st);
+    return these_columns;
+  };
+
+  // Now we loop over H5 files first to avoid unnecessary filesystem access
+  for (size_t i = 0; i < h5_files_to_combine.size(); i++) {
+    const std::string& filename = h5_files_to_combine[i];
+    const h5::H5File<h5::AccessType::ReadOnly> file_to_combine{filename};
+    for (const std::string& dat_filename : subfile_dat_names) {
+      const auto& input_dat_file = file_to_combine.get<h5::Dat>(dat_filename);
+      const std::vector<std::string>& legend = input_dat_file.get_legend();
+
+      // Avoid copying the legend around if we don't have to
+      h5::Dat* output_dat_file = nullptr;
+      if (output_h5_file.exists<h5::Dat>(dat_filename)) {
+        output_dat_file = &output_h5_file.get<h5::Dat>(dat_filename);
+      } else {
+        const uint32_t version = input_dat_file.get_version();
+        output_dat_file =
+            &output_h5_file.insert<h5::Dat>(dat_filename, legend, version);
+      }
+
+      const std::optional<size_t>& num_times = num_time_map.at(dat_filename)[i];
+
+      // Only append data if we include data from this file
+      if (num_times.has_value()) {
+        // Always start with row 0
+        const Matrix data_to_append = input_dat_file.get_data_subset(
+            make_these_columns(legend.size()), 0, num_times.value());
+        output_dat_file->append(data_to_append);
+      }
+
+      file_to_combine.close_current_object();
+      output_h5_file.close_current_object();
+    }
+  }
+
+  if (verbosity >= Verbosity::Quiet) {
+    Parallel::printf("Done!\n");
+  }
+}
 }  // namespace h5
diff --git a/src/IO/H5/CombineH5.hpp b/src/IO/H5/CombineH5.hpp
@@ -6,8 +6,44 @@
 #include <string>
 #include <vector>
 
+#include "IO/Logging/Verbosity.hpp"
+
 namespace h5 {
 void combine_h5_vol(const std::vector<std::string>& file_names,
                     const std::string& subfile_name, const std::string& output,
                     bool check_src = true);
+
+/*!
+ * \brief Combine the `h5::Dat` subfiles of multiple `h5::H5File`s into a single
+ * H5 file.
+ *
+ * \details If there are overlapping times, the "latest" one is always used;
+ * meaning if you have data in `File1.h5` and `File2.h5` and if the first time
+ * in `File2.h5` is before some times in `File1.h5`, those times in `File1.h5`
+ * will be discarded and won't appear in the combined H5 file.
+ *
+ * If the H5 files in \p h5_files_to_combine have other types of subfiles, those
+ * will be ignored and will not appear in \p output_h5_filename. This function
+ * also assumes that the times in each of the \p h5_files_to_combine are already
+ * sorted.
+ *
+ * The names of the `h5::Dat` subfiles to combine are taken from the first file
+ * in \p h5_files_to_combine.
+ *
+ * If \p h5_files_to_combine contains exactly one file, that file is simply
+ * copied to \p output_h5_filename (presumably including any non-`h5::Dat`
+ * subfiles, since no filtering is done in that case).
+ *
+ * If \p h5_files_to_combine is empty, an error will occur.
+ *
+ * If there are no `h5::Dat` files in the \p h5_files_to_combine, an error will
+ * occur.
+ *
+ * If the legend or version of an `h5::Dat` is not the same in all of
+ * \p h5_files_to_combine, an error will occur.
+ *
+ * \param h5_files_to_combine Vector of H5 files to combine. They must all have
+ * the same `h5::Dat` filenames, and those `h5::Dat` subfiles must have the same
+ * legends and versions. If not, an error will occur.
+ * \param output_h5_filename Name of the combined H5 file. The `h5::Dat` subfile
+ * structure will be identical to the ones in \p h5_files_to_combine.
+ * \param verbosity Controls how much is printed to stdout. Defaults to
+ * `Verbosity::Silent` (no output).
+ */
+void combine_h5_dat(const std::vector<std::string>& h5_files_to_combine,
+                    const std::string& output_h5_filename,
+                    Verbosity verbosity = Verbosity::Silent);
 }  // namespace h5
diff --git a/tests/Unit/IO/H5/CMakeLists.txt b/tests/Unit/IO/H5/CMakeLists.txt
@@ -6,6 +6,7 @@ set(LIBRARY "Test_H5")
 set(LIBRARY_SOURCES
   Test_Cce.cpp
   Test_CheckH5PropertiesMatch.cpp
+  Test_CombineH5.cpp
   Test_Dat.cpp
   Test_EosTable.cpp
   Test_H5.cpp