Skip to content

Commit

Permalink
fixup. Combine H5 dat
Browse files Browse the repository at this point in the history
  • Loading branch information
knelli2 committed Sep 27, 2024
1 parent e9f4bcf commit a67d874
Show file tree
Hide file tree
Showing 3 changed files with 93 additions and 60 deletions.
94 changes: 50 additions & 44 deletions src/IO/H5/CombineH5.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@
#include "Utilities/Algorithm.hpp"
#include "Utilities/ErrorHandling/Error.hpp"
#include "Utilities/FileSystem.hpp"
#include "Utilities/Gsl.hpp"
#include "Utilities/MakeString.hpp"
#include "Utilities/Numeric.hpp"
#include "Utilities/Serialization/Serialize.hpp"
#include "Utilities/StdHelpers.hpp"

Expand Down Expand Up @@ -323,12 +323,10 @@ void combine_h5_dat(const std::vector<std::string>& h5_files_to_combine,
return;
}

// For each dat subfile, this holds the number of times from each of the H5
// files to combine so that we always use the latest times. A nullopt means no
// times from that H5 file will be used. The std::vector should be the same
// length as the number of H5 files to combine
std::unordered_map<std::string, std::vector<std::optional<size_t>>>
num_time_map{};
// For each dat subfile, this holds the number of *sorted* times from each of
// the H5 files to combine so that we always use the latest times. The
// std::vector should be the same length as the number of H5 files to combine
std::unordered_map<std::string, std::vector<size_t>> num_time_map{};

// The outer loop is over dat files because we don't require different dat
// files to have the same times
Expand All @@ -355,10 +353,14 @@ void combine_h5_dat(const std::vector<std::string>& h5_files_to_combine,
}

// Only grab the times for now
const Matrix times = dat_file.get_data_subset({0}, 0, dimensions[0]);
auto times = dat_file.get_data_subset<std::vector<std::vector<double>>>(
{0}, 0, dimensions[0]);
alg::sort(times,
[](const std::vector<double>& v1,
const std::vector<double>& v2) { return v1[0] < v2[0]; });

// Makes things easier below.
if (UNLIKELY(times.rows() == 0)) {
if (UNLIKELY(times.size() == 0)) {

Check failure on line 363 in src/IO/H5/CombineH5.cpp

View workflow job for this annotation

GitHub Actions / Clang-tidy (Debug)

the 'empty' method should be used to check for emptiness instead of 'size'

Check failure on line 363 in src/IO/H5/CombineH5.cpp

View workflow job for this annotation

GitHub Actions / Clang-tidy (Release)

the 'empty' method should be used to check for emptiness instead of 'size'
ERROR_NO_TRACE("No times in dat file " << dat_filename << " in H5 file "
<< filename);
}
Expand All @@ -373,35 +375,43 @@ void combine_h5_dat(const std::vector<std::string>& h5_files_to_combine,
<< " doesn't match other H5 files.");
}

// This is the first file. We can't make any decisions here so just store
// the number of times and the earliest time of this file
// This is the first (last) file. We can't make any decisions here so just
// store the number of times and the earliest time of this file
if (not earliest_time.has_value()) {
num_time_map.at(dat_filename)[index] = dimensions[0];
earliest_time = times(0, 0);
earliest_time = times[0][0];
continue;
}

// Determine if the earliest time of the previous file (previous = later
// in the sequence of files since we are looping backward) is before
// any of the times in this file. If so, don't include those times.
std::optional<size_t> row = times.rows() - 1;
while (times(row.value(), 0) >= earliest_time.value()) {
// Once we get to the first time in this file, if it's still after the
// earliest time in the previous file, then we don't include any times
// from this file.
if (row.value() == 0) {
row.reset();
break;
// Check that the earliest time of the previous file (previous = later
// in the sequence of files since we are looping backward) is after the
// first time of this file. We require the files to be passed in
// increasing order by their first time.
if (UNLIKELY(times[0][0] >= earliest_time.value())) {
ERROR_NO_TRACE("The H5 files passed in "
<< h5_files_to_combine
<< " are not monotonically increasing in their first "
"times for dat file "
<< dat_filename << "");
}

// Determine if the earliest time of the previous file is before any of
// the times in this file. If so, don't include those times.
size_t row = times.size() - 1;
while (times[row][0] >= earliest_time.value()) {
// This should have been taken care of before so we should never get
// here
if (UNLIKELY(row == 0)) {
ERROR_NO_TRACE("Internal consistency error. Please file an issue.");
}
row.value()--;

row--;
}

// So long as this file contains some times that need to be combined,
// store the number of times and the first time in this file.
if (row.has_value()) {
num_time_map.at(dat_filename)[index] = row.value() + 1;
earliest_time = times(0, 0);
}
num_time_map.at(dat_filename)[index] = row + 1;
earliest_time = times[0][0];

file_to_combine.close_current_object();
}
Expand All @@ -415,7 +425,7 @@ void combine_h5_dat(const std::vector<std::string>& h5_files_to_combine,
ss << " Dat Subfile " << subfile_name << ":\n";
for (size_t i = 0; i < h5_files_num_times.size(); i++) {
ss << " H5 File " << h5_files_to_combine[i] << ": "
<< h5_files_num_times[i].value_or(0) << "\n";
<< h5_files_num_times[i] << "\n";
}
}

Expand All @@ -426,13 +436,6 @@ void combine_h5_dat(const std::vector<std::string>& h5_files_to_combine,
// combine, we open the output file and combine everything
h5::H5File<h5::AccessType::ReadWrite> output_h5_file{output_h5_filename};

const auto make_these_columns = [](const size_t size) -> std::vector<size_t> {
std::vector<size_t> these_columns{};
these_columns.resize(size);
alg::iota(these_columns, 0_st);
return these_columns;
};

// Now we loop over H5 files first to avoid unnecessary filesystem access
for (size_t i = 0; i < h5_files_to_combine.size(); i++) {
const std::string& filename = h5_files_to_combine[i];
Expand All @@ -451,15 +454,18 @@ void combine_h5_dat(const std::vector<std::string>& h5_files_to_combine,
&output_h5_file.insert<h5::Dat>(dat_filename, legend, version);
}

const std::optional<size_t>& num_times = num_time_map.at(dat_filename)[i];
const size_t num_times = num_time_map.at(dat_filename)[i];

// Only append data if we include data from this file
if (num_times.has_value()) {
// Always start with row 0
const Matrix data_to_append = input_dat_file.get_data_subset(
make_these_columns(legend.size()), 0, num_times.value());
output_dat_file->append(data_to_append);
}
// We must get all data first and sort it by times, because the number of
// times is only meaningful for the sorted data
auto data_to_append =
input_dat_file.get_data<std::vector<std::vector<double>>>();
alg::sort(data_to_append,
[](const std::vector<double>& v1,
const std::vector<double>& v2) { return v1[0] < v2[0]; });
data_to_append.resize(num_times);

output_dat_file->append(data_to_append);

file_to_combine.close_current_object();
output_h5_file.close_current_object();
Expand Down
17 changes: 10 additions & 7 deletions src/IO/H5/CombineH5.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,18 @@ void combine_h5_vol(const std::vector<std::string>& file_names,
* \brief Combine the `h5::Dat` subfiles of multiple `h5::H5File`s into a single
* H5 file.
*
* \details If there are overlapping times, the "latest" one is always used;
* meaning if you have data in `File1.h5` and `File2.h5` and if the first time
* in `File2.h5` is before some times in `File1.h5`, those times in `File1.h5`
* will be discarded and won't appear in the combined H5 file.
* \details The times in each `h5::Dat` subfile can be unordered. The necessary
* sorting will be handled in this function. However, the \p h5_files_to_combine
* must be monotonically increasing in time; meaning the earliest time in
* `File1.h5` must come before the earliest time in `File2.h5`.
*
* If there are overlapping times, the "latest" one is always used;
* meaning if you have data in `File1.h5` and `File2.h5` and if the earliest
* time in `File2.h5` is before some times in `File1.h5`, those times in
* `File1.h5` will be discarded and won't appear in the combined H5 file.
*
* If the H5 files in \p h5_files_to_combine have other types of subfiles, those
* will be ignored and will not appear in \p output_h5_filename. This function
* also assumes that the times in each of the \p h5_files_to_combine are already
* sorted.
* will be ignored and will not appear in \p output_h5_filename.
*
* If \p h5_files_to_combine is empty, an error will occur.
*
Expand Down
42 changes: 33 additions & 9 deletions tests/Unit/IO/H5/Test_CombineH5.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,13 @@ void test_single_file() {
CHECK(dat_file.get_legend() == legend);
CHECK(dat_file.get_data() == data);
}

if (file_system::check_if_file_exists(combined_filename)) {
file_system::rm(combined_filename, true);
}
if (file_system::check_if_file_exists(original_filename)) {
file_system::rm(original_filename, true);
}
}

void test() {
Expand All @@ -54,22 +61,18 @@ void test() {
file_system::rm(combined_filename, true);
}
const std::vector<std::string> individual_filenames{
"RedRanger.h5", "BlackRanger.h5", "BlueRanger.h5", "YellowRanger.h5",
"PinkRanger.h5"};
"RedRanger.h5", "BlackRanger.h5", "BlueRanger.h5", "YellowRanger.h5"};
const std::vector<std::string> subfile_names{"Subfile1", "Subfile2"};
// All subfiles can just share the same legend. The data doesn't matter, only
// the times for this test.
const std::vector<std::string> legend{"Time", "Data"};
const std::vector<std::vector<std::vector<double>>> data{
// This file doesn't keep the last time
{{0.0, 0.0}, {1.0, 1.0}, {2.0, 0.0}},
// This file keeps all times
{{1.5, 2.0}, {2.1, 3.0}},
// This file is completely discarded because the next files' first time is
// the same as this first time
{{3.0, 0.0}, {4.0, 0.0}, {5.0, 0.0}},
// This file only keeps the first time
{{3.0, 4.0}, {4.1, 0.0}, {4.5, 0.0}},
// This file keeps all times, but is unordered
{{2.1, 3.0}, {1.5, 2.0}},
// This file only keeps the earliest time, but is unordered
{{4.1, 0.0}, {3.0, 4.0}, {4.5, 0.0}},
// This file keeps all times
{{4.0, 5.0}, {5.5, 6.0}, {6.0, 7.0}}};

Expand Down Expand Up @@ -215,6 +218,27 @@ void test_errors() {
}

delete_files();

{
INFO("Non monotonically increasing H5 files");
{
h5::H5File<h5::AccessType::ReadWrite> h5_file{error_filename_1, true};
auto& dat_file = h5_file.try_insert<h5::Dat>(
"DatSubfile", std::vector<std::string>{"Time", "Blah"}, 0);
dat_file.append(std::vector{1.0, 0.0});
}
{
h5::H5File<h5::AccessType::ReadWrite> h5_file{error_filename_2, true};
auto& dat_file = h5_file.try_insert<h5::Dat>(
"DatSubfile", std::vector<std::string>{"Time", "Blah"}, 0);
dat_file.append(std::vector{0.0, 0.0});
}
CHECK_THROWS_WITH(
h5::combine_h5_dat({error_filename_1, error_filename_2}, fake_file),
Catch::Matchers::ContainsSubstring(
"are not monotonically increasing in their first "
"times for dat file"));
}
}
} // namespace

Expand Down

0 comments on commit a67d874

Please sign in to comment.