Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Performance improvement 3: ngram profiles #203

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion libursa/Database.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,21 @@ void Database::load_from_disk() {
load_dataset(dataset_fname);
}

// As a heuristic, use the biggest dataset to generate an ngram profile.
// This has several problems:
// - The biggest dataset may not exist. In this case, use an empty profile.
// - The biggest dataset may change during the lifetime of the database. We
// don't (currently) implement any mechanism to reload the ngram profile.
auto biggest_dataset = std::max_element(
working_datasets.begin(), working_datasets.end(), [](auto *a, auto *b) {
return a->get_file_count() < b->get_file_count();
});
if (biggest_dataset == working_datasets.end()) {
profile = NgramProfile();
} else {
profile = (*biggest_dataset)->generate_ngram_profile();
}

for (const auto &iterator : db_json["iterators"].items()) {
DatabaseName name(db_base, "iterator", iterator.key(),
iterator.value());
Expand Down Expand Up @@ -288,5 +303,5 @@ DatabaseSnapshot Database::snapshot() {
}

return DatabaseSnapshot(db_name, db_base, config_, iterators, cds,
taskspecs);
taskspecs, &profile);
}
8 changes: 8 additions & 0 deletions libursa/Database.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,16 @@ class Database {
fs::path db_name;
fs::path db_base;
std::map<std::string, OnDiskIterator> iterators;
// Datasets that are a part of database. New tasks will only use working
// datasets for their operations.
std::vector<OnDiskDataset *> working_datasets;
// Datasets that are loaded into memory (probably used by at least one
// task running in the db). We can't always unload dataset immediately
// after dropping it, because it may be used by a task.
std::vector<std::unique_ptr<OnDiskDataset>> loaded_datasets;
// Ngram profile is a set of heuristic information about the "expected"
// relative sizes of data related to various ngrams.
NgramProfile profile;
DatabaseConfig config_;

uint64_t last_task_id;
Expand Down
7 changes: 4 additions & 3 deletions libursa/DatabaseSnapshot.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,14 @@ DatabaseSnapshot::DatabaseSnapshot(
fs::path db_name, fs::path db_base, DatabaseConfig config,
std::map<std::string, OnDiskIterator> iterators,
std::vector<const OnDiskDataset *> datasets,
std::unordered_map<uint64_t, TaskSpec> tasks)
std::unordered_map<uint64_t, TaskSpec> tasks, const NgramProfile *profile)
: db_name(std::move(db_name)),
db_base(std::move(db_base)),
iterators(std::move(iterators)),
config(std::move(config)),
datasets(std::move(datasets)),
tasks(std::move(tasks)) {}
tasks(std::move(tasks)),
profile(profile) {}

const OnDiskDataset *DatabaseSnapshot::find_dataset(
const std::string &name) const {
Expand Down Expand Up @@ -234,7 +235,7 @@ QueryCounters DatabaseSnapshot::execute(const Query &query,
if (!ds->has_all_taints(taints)) {
continue;
}
ds->execute(query, out, &counters);
ds->execute(query, out, &counters, *profile);
}
return counters;
}
Expand Down
4 changes: 3 additions & 1 deletion libursa/DatabaseSnapshot.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ class DatabaseSnapshot {
std::map<std::string, OnDiskIterator> iterators;
DatabaseConfig config;
std::vector<const OnDiskDataset *> datasets;
const NgramProfile *profile;
std::set<std::string> locked_datasets;
std::set<std::string> locked_iterators;
std::unordered_map<uint64_t, TaskSpec> tasks;
Expand Down Expand Up @@ -57,7 +58,8 @@ class DatabaseSnapshot {
DatabaseSnapshot(fs::path db_name, fs::path db_base, DatabaseConfig config,
std::map<std::string, OnDiskIterator> iterators,
std::vector<const OnDiskDataset *> datasets,
std::unordered_map<uint64_t, TaskSpec> tasks);
std::unordered_map<uint64_t, TaskSpec> tasks,
const NgramProfile *profile);

DatabaseName derive_name(const DatabaseName &original,
const std::string &type) const {
Expand Down
33 changes: 31 additions & 2 deletions libursa/OnDiskDataset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,20 @@
#include "Query.h"
#include "spdlog/spdlog.h"

// Returns the (approximate) size in bytes of the on-disk data for `primitive`.
// The absolute number is not meaningful by itself; it is only used as a
// relative weight, so that more common ngrams get bigger values.
// NOTE(review): callers may receive this through PrimitiveEvaluator, which is
// declared as returning uint32_t — confirm large runs can't be truncated.
// Throws std::runtime_error if the profile is non-empty but does not cover
// the primitive's index type.
uint64_t NgramProfile::size_in_bytes(PrimitiveQuery primitive) const {
    if (profiles.empty()) {
        // The profile is empty - return the same estimate for everything.
        return 0;
    }
    // Keyed lookup instead of a linear scan over the map.
    const auto it = profiles.find(primitive.itype);
    if (it == profiles.end()) {
        throw std::runtime_error("Unexpected ngram type in ngram profile");
    }
    // Run offsets: ngram X spans bytes [offsets[X], offsets[X+1]).
    const auto &offsets = it->second;
    return offsets.at(primitive.trigram + 1) - offsets.at(primitive.trigram);
}

void OnDiskDataset::save() {
std::set<std::string> index_names;
for (const auto &name : indices) {
Expand Down Expand Up @@ -85,12 +99,16 @@ QueryResult OnDiskDataset::query(const Query &query,
}

void OnDiskDataset::execute(const Query &query, ResultWriter *out,
QueryCounters *counters) const {
QueryCounters *counters,
const NgramProfile &profile) const {
std::unordered_set<IndexType> types_to_query;
for (const auto &ndx : get_indexes()) {
types_to_query.emplace(ndx.index_type());
}
const Query plan = query.plan(types_to_query);
PrimitiveEvaluator evaluator = [&profile](PrimitiveQuery primitive) {
return profile.size_in_bytes(primitive);
};
const Query plan = query.plan(types_to_query, evaluator);

QueryResult result = this->query(plan, counters);
if (result.is_everything()) {
Expand Down Expand Up @@ -312,3 +330,14 @@ std::vector<const OnDiskDataset *> OnDiskDataset::get_compact_candidates(

return out;
}

// Generates an ngram profile from this dataset. This is basically just
// copying run offsets from all indexes into a new NgramProfile object.
NgramProfile OnDiskDataset::generate_ngram_profile() const {
    std::map<IndexType, std::vector<uint64_t>> profiles;
    for (const auto &index : indices) {
        // read_run_offsets() already yields an rvalue here, so wrapping it
        // in std::move was redundant (and can pessimize RVO).
        profiles.emplace(index.index_type(), index.read_run_offsets());
    }
    return NgramProfile(std::move(profiles));
}
38 changes: 36 additions & 2 deletions libursa/OnDiskDataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,39 @@
#include "ResultWriter.h"
#include "Task.h"

// This class contains statistics about how common various ngrams are
// in the dataset. It is used during the optimisation phase, to run queries
// on small ngrams first (to speed up the overall process).
// There is a single ngram profile per database to save RAM. We could read
// this information from disk directly, but we want to avoid reading from
// disk when possible (after all, this is the point of this class).
class NgramProfile {
   private:
    // The vectors here are run offsets from OnDiskIndex, i.e. ngram X spans
    // bytes from vector[X] to vector[X+1].
    std::map<IndexType, std::vector<uint64_t>> profiles;

   public:
    // Creates an empty profile; size_in_bytes() then returns 0 for all ngrams.
    NgramProfile() : profiles() {}
    // `explicit` to forbid accidental implicit conversion from a raw map.
    explicit NgramProfile(std::map<IndexType, std::vector<uint64_t>> &&profiles)
        : profiles(std::move(profiles)) {}
    NgramProfile &operator=(NgramProfile &&other) = default;
    NgramProfile(NgramProfile &&other) = default;
    // Profiles can be large - forbid copies explicitly (assignment was
    // already implicitly deleted; spell it out for clarity).
    NgramProfile(const NgramProfile &other) = delete;
    NgramProfile &operator=(const NgramProfile &other) = delete;

    // Returns the size in bytes of data for a given ngram.
    // Worth noting that the specific number is not important. What matters is
    // that - on average - more common ngrams will return bigger values.
    uint64_t size_in_bytes(PrimitiveQuery primitive) const;
};

// Represents a single dataset. Dataset is the smallest independent data
// component in mquery. For example, it's entirely possible to copy dataset
// from one server into another and expect it to work in the same way.
// Dataset has:
// - An unique name.
// - A set of 1 or more indexes (up to one of gram3, text4, wide8, hash4).
// - List of filenames contained in this dataset.
class OnDiskDataset {
std::string name;
fs::path db_base;
Expand Down Expand Up @@ -42,8 +75,8 @@ class OnDiskDataset {
}
void toggle_taint(const std::string &taint);
bool has_all_taints(const std::set<std::string> &taints) const;
void execute(const Query &query, ResultWriter *out,
QueryCounters *counters) const;
void execute(const Query &query, ResultWriter *out, QueryCounters *counters,
const NgramProfile &profile) const;
uint64_t get_file_count() const { return files_index->get_file_count(); }
void for_each_filename(std::function<void(const std::string &)> cb) const {
files_index->for_each_filename(cb);
Expand All @@ -58,6 +91,7 @@ class OnDiskDataset {
const std::set<std::string> &get_taints() const { return taints; }
static std::vector<const OnDiskDataset *> get_compact_candidates(
const std::vector<const OnDiskDataset *> &datasets);
NgramProfile generate_ngram_profile() const;

// Returns vectors of compatible datasets. Datasets are called compatible
// when they can be merged with each other - they have the same types and
Expand Down
5 changes: 5 additions & 0 deletions libursa/OnDiskIndex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,11 @@ uint64_t find_max_batch(const std::vector<IndexMergeHelper> &indexes,
return NUM_TRIGRAMS - trigram;
}

// Returns how many bytes the on-disk run for `trigram` occupies,
// computed from the run's start/end offsets.
uint64_t OnDiskIndex::run_size_in_bytes(TriGram trigram) const {
    const auto &[run_start, run_end] = get_run_offsets(trigram);
    return run_end - run_start;
}

// Merge the indexes, and stream the results to the `out` stream immediately.
// This function tries to batch reads, which makes it much more efficient on
// HDDs (on SSDs the difference is not noticeable).
Expand Down
3 changes: 3 additions & 0 deletions libursa/OnDiskIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ class OnDiskIndex {
const std::string &get_fname() const { return fname; }
const fs::path &get_fpath() const { return fpath; }
IndexType index_type() const { return ntype; }

// Gets the size in bytes of the run for the given trigram.
uint64_t run_size_in_bytes(TriGram trigram) const;
QueryResult query(TriGram trigram, QueryCounters *counters) const;
uint64_t real_size() const;
static void on_disk_merge(const fs::path &db_base, const std::string &fname,
Expand Down
11 changes: 8 additions & 3 deletions libursa/Query.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,19 +181,24 @@ std::vector<PrimitiveQuery> plan_qstring(
return std::move(plan);
}

Query Query::plan(const std::unordered_set<IndexType> &types_to_query) const {
Query Query::plan(const std::unordered_set<IndexType> &types_to_query,
const PrimitiveEvaluator &evaluate) const {
if (type != QueryType::PRIMITIVE) {
std::vector<Query> plans;
for (const auto &query : queries) {
plans.emplace_back(query.plan(types_to_query));
plans.emplace_back(query.plan(types_to_query, evaluate));
}
if (type == QueryType::MIN_OF) {
return Query(count, std::move(plans));
}
return Query(type, std::move(plans));
}

return Query(plan_qstring(types_to_query, value));
std::vector<PrimitiveQuery> plan = plan_qstring(types_to_query, value);
std::sort(plan.begin(), plan.end(), [&evaluate](auto l, auto r) {
return evaluate(l) < evaluate(r);
});
return Query(std::move(plan));
}

QueryResult Query::run(const QueryPrimitive &primitive,
Expand Down
12 changes: 9 additions & 3 deletions libursa/Query.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ class PrimitiveQuery {
PrimitiveQuery(IndexType itype, TriGram trigram)
: itype(itype), trigram(trigram) {}

const IndexType itype;
const TriGram trigram;
IndexType itype;
TriGram trigram;

// We want to use PrimitiveQuery in STL containers, and this means they
// must be comparable using <. Specific order doesn't matter.
Expand All @@ -33,6 +33,11 @@ class PrimitiveQuery {
using QueryPrimitive =
std::function<QueryResult(PrimitiveQuery, QueryCounters *counter)>;

// Type of function used to evaluate a given PrimitiveQuery.
// The lower the result, the smaller the (expected) size of the result,
// and the higher the priority the given ngram should get.
using PrimitiveEvaluator = std::function<uint32_t(PrimitiveQuery)>;

// Query represents the query as provided by the user.
// Query can contain subqueries (using AND/OR/MINOF) or be a literal query.
// There are actually two types of literal query objects - "plain" and
Expand Down Expand Up @@ -62,7 +67,8 @@ class Query {

QueryResult run(const QueryPrimitive &primitive,
QueryCounters *counters) const;
Query plan(const std::unordered_set<IndexType> &types_to_query) const;
Query plan(const std::unordered_set<IndexType> &types_to_query,
const PrimitiveEvaluator &evaluate) const;

private:
QueryType type;
Expand Down