Skip to content

Commit

Permalink
reduce latency creating datasets
Browse files Browse the repository at this point in the history
  • Loading branch information
awenocur committed Aug 13, 2024
1 parent bbe02af commit 89fc233
Show file tree
Hide file tree
Showing 8 changed files with 58 additions and 21 deletions.
35 changes: 25 additions & 10 deletions libtiledbvcf/src/dataset/tiledbvcfdataset.cc
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,6 @@ void TileDBVCFDataset::create(const CreationParams& params) {
TILEDB_STRING_ASCII,
dataset_type.size(),
dataset_type.c_str());
group.close();

Metadata metadata;
metadata.tile_capacity = params.tile_capacity;
Expand All @@ -245,30 +244,46 @@ void TileDBVCFDataset::create(const CreationParams& params) {
}

// Create arrays and subgroups and add them to the root group
create_empty_metadata(ctx, params.uri, metadata, params.checksum);
create_empty_metadata(ctx, params.uri, metadata, params.checksum, group);
if (!group.is_open()) {
group.open(TILEDB_WRITE);
}
create_empty_data_array(
ctx,
params.uri,
metadata,
params.checksum,
params.allow_duplicates,
params.compress_sample_dim,
params.compression_level);
params.compression_level,
group);
if (!group.is_open()) {
group.open(TILEDB_WRITE);
}

if (params.enable_allele_count) {
AlleleCount::create(ctx, params.uri, params.checksum);
AlleleCount::create(ctx, params.uri, params.checksum, group);
if (!group.is_open()) {
group.open(TILEDB_WRITE);
}
}
if (params.enable_variant_stats) {
VariantStats::set_array_version(params.variant_stats_array_version);
VariantStats::create(ctx, params.uri, params.checksum);
VariantStats::create(ctx, params.uri, params.checksum, group);
if (!group.is_open()) {
group.open(TILEDB_WRITE);
}
}
if (params.enable_sample_stats) {
SampleStats::create(ctx, params.uri, params.checksum);
SampleStats::create(ctx, params.uri, group, params.checksum);
}

write_metadata_v4(ctx, params.uri, metadata);

// Log the group structure
if (group.is_open()) {
group.close();
}
group.open(TILEDB_READ);
LOG_DEBUG("TileDB Groups: \n{}", group.dump(true));

Expand Down Expand Up @@ -310,7 +325,8 @@ void TileDBVCFDataset::create_empty_metadata(
const Context& ctx,
const std::string& root_uri,
const Metadata& metadata,
const tiledb_filter_type_t& checksum) {
const tiledb_filter_type_t& checksum,
Group root_group) {
create_group(ctx, metadata_group_uri(root_uri));
create_sample_header_array(ctx, root_uri, checksum);

Expand All @@ -319,7 +335,6 @@ void TileDBVCFDataset::create_empty_metadata(

// Add arrays to the root group
// We add the vcf_header array to the root group to simplify array opening.
Group root_group(ctx, root_uri, TILEDB_WRITE);
auto array_uri = vcf_headers_uri(root_uri, relative);
LOG_DEBUG(
"Adding array name='{}' uri='{}' to group uri='{}'",
Expand All @@ -346,7 +361,8 @@ void TileDBVCFDataset::create_empty_data_array(
const tiledb_filter_type_t& checksum,
const bool allow_duplicates,
const bool compress_sample_dim,
const int compression_level) {
const int compression_level,
Group root_group) {
ArraySchema schema(ctx, TILEDB_SPARSE);
schema.set_capacity(metadata.tile_capacity);
schema.set_order({{TILEDB_ROW_MAJOR, TILEDB_ROW_MAJOR}});
Expand Down Expand Up @@ -464,7 +480,6 @@ void TileDBVCFDataset::create_empty_data_array(
DATA_ARRAY,
array_uri,
root_uri);
Group root_group(ctx, root_uri, TILEDB_WRITE);
root_group.add_member(array_uri, relative, DATA_ARRAY);
}

Expand Down
8 changes: 6 additions & 2 deletions libtiledbvcf/src/dataset/tiledbvcfdataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -854,12 +854,14 @@ class TileDBVCFDataset {
* @param root_uri Root URI of the dataset
* @param metadata General dataset metadata to write
* @param checksum optional checksum filter
* @param group Root group of the dataset, to which the VCF header array is added as a member
*/
static void create_empty_metadata(
const Context& ctx,
const std::string& root_uri,
const Metadata& metadata,
const tiledb_filter_type_t& checksum);
const tiledb_filter_type_t& checksum,
Group group);

/**
* Creates the empty sample data array for a new dataset.
Expand All @@ -868,6 +870,7 @@ class TileDBVCFDataset {
* @param root_uri Root URI of the dataset
* @param metadata Dataset metadata containing tile capacity etc. to use
* @param checksum optional checksum filter
* @param group Root group of the dataset, to which the new data array is added as a member
*/
static void create_empty_data_array(
const Context& ctx,
Expand All @@ -876,7 +879,8 @@ class TileDBVCFDataset {
const tiledb_filter_type_t& checksum,
const bool allow_duplicates,
const bool compress_sample_dim,
const int compression_level);
const int compression_level,
Group group);

/**
* Creates the empty sample header array for a new dataset.
Expand Down
6 changes: 4 additions & 2 deletions libtiledbvcf/src/stats/allele_count.cc
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,10 @@ std::string AlleleCount::get_uri(const Group& group) {
}

void AlleleCount::create(
Context& ctx, const std::string& root_uri, tiledb_filter_type_t checksum) {
Context& ctx,
const std::string& root_uri,
tiledb_filter_type_t checksum,
Group root_group) {
LOG_DEBUG("[AlleleCount] Create array");

// Create filter lists
Expand Down Expand Up @@ -120,7 +123,6 @@ void AlleleCount::create(
auto relative = !utils::starts_with(root_uri, "tiledb://");
auto array_uri = get_uri(root_uri, relative);
LOG_DEBUG("Adding array '{}' to group '{}'", array_uri, root_uri);
Group root_group(ctx, root_uri, TILEDB_WRITE);
root_group.add_member(array_uri, relative, ALLELE_COUNT_ARRAY);
}

Expand Down
6 changes: 5 additions & 1 deletion libtiledbvcf/src/stats/allele_count.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,13 @@ class AlleleCount {
* @param ctx TileDB context
* @param root_uri TileDB-VCF dataset uri
* @param checksum TileDB checksum filter
* @param group Root TileDB-VCF group to which the new array is added as a member
*/
static void create(
Context& ctx, const std::string& root_uri, tiledb_filter_type_t checksum);
Context& ctx,
const std::string& root_uri,
tiledb_filter_type_t checksum,
Group group);

/**
* @brief Check if the array exists.
Expand Down
6 changes: 4 additions & 2 deletions libtiledbvcf/src/stats/sample_stats.cc
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,10 @@ std::string SampleStats::get_uri_(const std::string& root_uri, bool relative) {
}

void SampleStats::create(
Context& ctx, const std::string& root_uri, int compression_level) {
Context& ctx,
const std::string& root_uri,
Group root_group,
int compression_level) {
// Create filter lists
FilterList int_fl(ctx);
FilterList float_fl(ctx);
Expand Down Expand Up @@ -155,7 +158,6 @@ void SampleStats::create(
auto array_uri = get_uri_(root_uri, relative);
LOG_DEBUG(
"[SampleStats] Adding array '{}' to group '{}'", array_uri, root_uri);
Group root_group(ctx, root_uri, TILEDB_WRITE);
root_group.add_member(array_uri, relative, SAMPLE_STATS_ARRAY);
}

Expand Down
6 changes: 5 additions & 1 deletion libtiledbvcf/src/stats/sample_stats.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,14 @@ class SampleStats {
*
* @param ctx TileDB context
* @param root_uri URI of TileDB-VCF dataset
* @param group Root TileDB-VCF group to which the new array is added as a member
* @param compression_level zstd compression level
*/
static void create(
Context& ctx, const std::string& root_uri, int compression_level = 9);
Context& ctx,
const std::string& root_uri,
Group group,
int compression_level = 9);

/**
* @brief Check if the array exists.
Expand Down
6 changes: 4 additions & 2 deletions libtiledbvcf/src/stats/variant_stats.cc
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,10 @@ std::string VariantStats::get_uri(const Group& group) {
}

void VariantStats::create(
Context& ctx, const std::string& root_uri, tiledb_filter_type_t checksum) {
Context& ctx,
const std::string& root_uri,
tiledb_filter_type_t checksum,
Group root_group) {
LOG_DEBUG("[VariantStats] Create array");

// Create filter lists
Expand Down Expand Up @@ -167,7 +170,6 @@ void VariantStats::create(
auto relative = !utils::starts_with(root_uri, "tiledb://");
auto array_uri = get_uri(root_uri, relative);
LOG_DEBUG("Adding array '{}' to group '{}'", array_uri, root_uri);
Group root_group(ctx, root_uri, TILEDB_WRITE);
root_group.add_member(array_uri, relative, VARIANT_STATS_ARRAY);
}

Expand Down
6 changes: 5 additions & 1 deletion libtiledbvcf/src/stats/variant_stats.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,13 @@ class VariantStats {
*
* @param ctx TileDB context
* @param root_uri TileDB-VCF dataset uri
* @param checksum TileDB checksum filter
* @param group Root TileDB-VCF group to which the new array is added as a member
*/
static void create(
Context& ctx, const std::string& root_uri, tiledb_filter_type_t checksum);
Context& ctx,
const std::string& root_uri,
tiledb_filter_type_t checksum,
Group group);

/**
* @brief Check if the array exists.
Expand Down

0 comments on commit 89fc233

Please sign in to comment.