fixup. Combine H5 dat

sxs-collaboration · Sep 27, 2024 · a67d874 · a67d874
1 parent e9f4bcf
commit a67d874
Show file tree

Hide file tree

Showing 3 changed files with 93 additions and 60 deletions.
diff --git a/src/IO/H5/CombineH5.cpp b/src/IO/H5/CombineH5.cpp
@@ -30,8 +30,8 @@
 #include "Utilities/Algorithm.hpp"
 #include "Utilities/ErrorHandling/Error.hpp"
 #include "Utilities/FileSystem.hpp"
+#include "Utilities/Gsl.hpp"
 #include "Utilities/MakeString.hpp"
-#include "Utilities/Numeric.hpp"
 #include "Utilities/Serialization/Serialize.hpp"
 #include "Utilities/StdHelpers.hpp"
 
@@ -323,12 +323,10 @@ void combine_h5_dat(const std::vector<std::string>& h5_files_to_combine,
     return;
   }
 
-  // For each dat subfile, this holds the number of times from each of the H5
-  // files to combine so that we always use the latest times. A nullopt means no
-  // times from that H5 file will be used. The std::vector should be the same
-  // length as the number of H5 files to combine
-  std::unordered_map<std::string, std::vector<std::optional<size_t>>>
-      num_time_map{};
+  // For each dat subfile, this holds the number of *sorted* times from each of
+  // the H5 files to combine so that we always use the latest times. The
+  // std::vector should be the same length as the number of H5 files to combine
+  std::unordered_map<std::string, std::vector<size_t>> num_time_map{};
 
   // The outer loop is over dat files because we don't require different dat
   // files to have the same times
@@ -355,10 +353,14 @@ void combine_h5_dat(const std::vector<std::string>& h5_files_to_combine,
       }
 
       // Only grab the times for now
-      const Matrix times = dat_file.get_data_subset({0}, 0, dimensions[0]);
+      auto times = dat_file.get_data_subset<std::vector<std::vector<double>>>(
+          {0}, 0, dimensions[0]);
+      alg::sort(times,
+                [](const std::vector<double>& v1,
+                   const std::vector<double>& v2) { return v1[0] < v2[0]; });
 
       // Makes things easier below.
-      if (UNLIKELY(times.rows() == 0)) {
+      if (UNLIKELY(times.size() == 0)) {
         ERROR_NO_TRACE("No times in dat file " << dat_filename << " in H5 file "
                                                << filename);
       }
@@ -373,35 +375,43 @@ void combine_h5_dat(const std::vector<std::string>& h5_files_to_combine,
                        << " doesn't match other H5 files.");
       }
 
-      // This is the first file. We can't make any decisions here so just store
-      // the number of times and the earliest time of this file
+      // This is the first (last) file. We can't make any decisions here so just
+      // store the number of times and the earliest time of this file
       if (not earliest_time.has_value()) {
         num_time_map.at(dat_filename)[index] = dimensions[0];
-        earliest_time = times(0, 0);
+        earliest_time = times[0][0];
         continue;
       }
 
-      // Determine if the earliest time of the previous file (previous = later
-      // in the sequence of files since we are looping backward) is before
-      // any of the times in this file. If so, don't include those times.
-      std::optional<size_t> row = times.rows() - 1;
-      while (times(row.value(), 0) >= earliest_time.value()) {
-        // Once we get to the first time in this file, if it's still after the
-        // earliest time in the previous file, then we don't include any times
-        // from this file.
-        if (row.value() == 0) {
-          row.reset();
-          break;
+      // Check that the earliest time of the previous file (previous = later
+      // in the sequence of files since we are looping backward) is after the
+      // earliest time of this file. We require the files to be passed in
+      // increasing order by their earliest time.
+      if (UNLIKELY(times[0][0] >= earliest_time.value())) {
+        ERROR_NO_TRACE("The H5 files passed in "
+                       << h5_files_to_combine
+                       << " are not monotonically increasing in their first "
+                          "times for dat file "
+                       << dat_filename << "");
+      }
+
+      // Determine if the earliest time of the previous file is before any of
+      // the times in this file. If so, don't include those times.
+      size_t row = times.size() - 1;
+      while (times[row][0] >= earliest_time.value()) {
+        // This should have been taken care of before so we should never get
+        // here
+        if (UNLIKELY(row == 0)) {
+          ERROR_NO_TRACE("Internal consistency error. Please file an issue.");
         }
-        row.value()--;
+
+        row--;
       }
 
       // So long as this file contains some times that need to be combined,
       // store the number of times and the first time in this file.
-      if (row.has_value()) {
-        num_time_map.at(dat_filename)[index] = row.value() + 1;
-        earliest_time = times(0, 0);
-      }
+      num_time_map.at(dat_filename)[index] = row + 1;
+      earliest_time = times[0][0];
 
       file_to_combine.close_current_object();
     }
@@ -415,7 +425,7 @@ void combine_h5_dat(const std::vector<std::string>& h5_files_to_combine,
       ss << " Dat Subfile " << subfile_name << ":\n";
       for (size_t i = 0; i < h5_files_num_times.size(); i++) {
         ss << "  H5 File " << h5_files_to_combine[i] << ": "
-           << h5_files_num_times[i].value_or(0) << "\n";
+           << h5_files_num_times[i] << "\n";
       }
     }
 
@@ -426,13 +436,6 @@ void combine_h5_dat(const std::vector<std::string>& h5_files_to_combine,
   // combine, we open the output file and combine everything
   h5::H5File<h5::AccessType::ReadWrite> output_h5_file{output_h5_filename};
 
-  const auto make_these_columns = [](const size_t size) -> std::vector<size_t> {
-    std::vector<size_t> these_columns{};
-    these_columns.resize(size);
-    alg::iota(these_columns, 0_st);
-    return these_columns;
-  };
-
   // Now we loop over H5 files first to avoid unnecessary filesystem access
   for (size_t i = 0; i < h5_files_to_combine.size(); i++) {
     const std::string& filename = h5_files_to_combine[i];
@@ -451,15 +454,18 @@ void combine_h5_dat(const std::vector<std::string>& h5_files_to_combine,
             &output_h5_file.insert<h5::Dat>(dat_filename, legend, version);
       }
 
-      const std::optional<size_t>& num_times = num_time_map.at(dat_filename)[i];
+      const size_t num_times = num_time_map.at(dat_filename)[i];
 
-      // Only append data if we include data from this file
-      if (num_times.has_value()) {
-        // Always start with row 0
-        const Matrix data_to_append = input_dat_file.get_data_subset(
-            make_these_columns(legend.size()), 0, num_times.value());
-        output_dat_file->append(data_to_append);
-      }
+      // We must get all data first and sort it by times, because the number of
+      // times is only meaningful for the sorted data
+      auto data_to_append =
+          input_dat_file.get_data<std::vector<std::vector<double>>>();
+      alg::sort(data_to_append,
+                [](const std::vector<double>& v1,
+                   const std::vector<double>& v2) { return v1[0] < v2[0]; });
+      data_to_append.resize(num_times);
+
+      output_dat_file->append(data_to_append);
 
       file_to_combine.close_current_object();
       output_h5_file.close_current_object();

diff --git a/src/IO/H5/CombineH5.hpp b/src/IO/H5/CombineH5.hpp
@@ -29,15 +29,18 @@ void combine_h5_vol(const std::vector<std::string>& file_names,
  * \brief Combine the `h5::Dat` subfiles of multiple `h5::H5File`s into a single
  * H5 file.
  *
- * \details If there are overlapping times, the "latest" one is always used;
- * meaning if you have data in `File1.h5` and `File2.h5` and if the first time
- * in `File2.h5` is before some times in `File1.h5`, those times in `File1.h5`
- * will be discarded and won't appear in the combined H5 file.
+ * \details The times in each `h5::Dat` subfile can be unordered. The necessary
+ * sorting will be handled in this function. However, the \p h5_files_to_combine
+ * must be monotonically increasing in time; meaning the earliest time in
+ * `File1.h5` must come before the earliest time in `File2.h5`.
+ *
+ * If there are overlapping times, the "latest" one is always used;
+ * meaning if you have data in `File1.h5` and `File2.h5` and if the earliest
+ * time in `File2.h5` is before some times in `File1.h5`, those times in
+ * `File1.h5` will be discarded and won't appear in the combined H5 file.
  *
  * If the H5 files in \p h5_files_to_combine have other types of subfiles, those
- * will be ignored and will not appear in \p output_h5_filename. This function
- * also assumes that the times in each of the \p h5_files_to_combine are already
- * sorted.
+ * will be ignored and will not appear in \p output_h5_filename.
  *
  * If \p h5_files_to_combine is empty, an error will occur.
  *

diff --git a/tests/Unit/IO/H5/Test_CombineH5.cpp b/tests/Unit/IO/H5/Test_CombineH5.cpp
@@ -46,6 +46,13 @@ void test_single_file() {
     CHECK(dat_file.get_legend() == legend);
     CHECK(dat_file.get_data() == data);
   }
+
+  if (file_system::check_if_file_exists(combined_filename)) {
+    file_system::rm(combined_filename, true);
+  }
+  if (file_system::check_if_file_exists(original_filename)) {
+    file_system::rm(original_filename, true);
+  }
 }
 
 void test() {
@@ -54,22 +61,18 @@ void test() {
     file_system::rm(combined_filename, true);
   }
   const std::vector<std::string> individual_filenames{
-      "RedRanger.h5", "BlackRanger.h5", "BlueRanger.h5", "YellowRanger.h5",
-      "PinkRanger.h5"};
+      "RedRanger.h5", "BlackRanger.h5", "BlueRanger.h5", "YellowRanger.h5"};
   const std::vector<std::string> subfile_names{"Subfile1", "Subfile2"};
   // All subfiles can just share the same legend. The data doesn't matter, only
   // the times for this test.
   const std::vector<std::string> legend{"Time", "Data"};
   const std::vector<std::vector<std::vector<double>>> data{
       // This file doesn't keep the last time
       {{0.0, 0.0}, {1.0, 1.0}, {2.0, 0.0}},
-      // This file keeps all times
-      {{1.5, 2.0}, {2.1, 3.0}},
-      // This file is completely discarded because the next files' first time is
-      // the same as this first time
-      {{3.0, 0.0}, {4.0, 0.0}, {5.0, 0.0}},
-      // This file only keeps the first time
-      {{3.0, 4.0}, {4.1, 0.0}, {4.5, 0.0}},
+      // This file keeps all times, but is unordered
+      {{2.1, 3.0}, {1.5, 2.0}},
+      // This file only keeps the earliest time, but is unordered
+      {{4.1, 0.0}, {3.0, 4.0}, {4.5, 0.0}},
       // This file keeps all times
       {{4.0, 5.0}, {5.5, 6.0}, {6.0, 7.0}}};
 
@@ -215,6 +218,27 @@ void test_errors() {
   }
 
   delete_files();
+
+  {
+    INFO("Non monotonically increasing H5 files");
+    {
+      h5::H5File<h5::AccessType::ReadWrite> h5_file{error_filename_1, true};
+      auto& dat_file = h5_file.try_insert<h5::Dat>(
+          "DatSubfile", std::vector<std::string>{"Time", "Blah"}, 0);
+      dat_file.append(std::vector{1.0, 0.0});
+    }
+    {
+      h5::H5File<h5::AccessType::ReadWrite> h5_file{error_filename_2, true};
+      auto& dat_file = h5_file.try_insert<h5::Dat>(
+          "DatSubfile", std::vector<std::string>{"Time", "Blah"}, 0);
+      dat_file.append(std::vector{0.0, 0.0});
+    }
+    CHECK_THROWS_WITH(
+        h5::combine_h5_dat({error_filename_1, error_filename_2}, fake_file),
+        Catch::Matchers::ContainsSubstring(
+            "are not monotonically increasing in their first "
+            "times for dat file"));
+  }
 }
 }  // namespace