Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Performance improvement 3: ngram profiles #203

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion libursa/Database.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,21 @@ void Database::load_from_disk() {
load_dataset(dataset_fname);
}

// As a heuristic, use the biggest dataset to generate an ngram profile.
// This has several problems:
// - The biggest dataset may not exist. In this case, use an empty profile.
// - The biggest dataset may change during the lifetime of the database. We
// don't (currently) implement any mechanism to reload the ngram profile.
auto biggest_dataset = std::max_element(
working_datasets.begin(), working_datasets.end(), [](auto *a, auto *b) {
return a->get_file_count() < b->get_file_count();
});
if (biggest_dataset == working_datasets.end()) {
profile = NgramProfile();
} else {
profile = (*biggest_dataset)->generate_ngram_profile();
}

for (const auto &iterator : db_json["iterators"].items()) {
DatabaseName name(db_base, "iterator", iterator.key(),
iterator.value());
Expand Down Expand Up @@ -288,5 +303,5 @@ DatabaseSnapshot Database::snapshot() {
}

return DatabaseSnapshot(db_name, db_base, config_, iterators, cds,
taskspecs);
taskspecs, &profile);
}
8 changes: 8 additions & 0 deletions libursa/Database.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,16 @@ class Database {
fs::path db_name;
fs::path db_base;
std::map<std::string, OnDiskIterator> iterators;
// Datasets that are a part of database. New tasks will only use working
// datasets for their operations.
std::vector<OnDiskDataset *> working_datasets;
// Datasets that are loaded into memory (probably used by at least one
// task running in the db). We can't always unload dataset immediately
// after dropping it, because it may be used by a task.
std::vector<std::unique_ptr<OnDiskDataset>> loaded_datasets;
// Ngram profile is a set of heuristic information about the "expected"
// relative sizes of data related to various ngrams.
NgramProfile profile;
DatabaseConfig config_;

uint64_t last_task_id;
Expand Down
7 changes: 4 additions & 3 deletions libursa/DatabaseSnapshot.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,14 @@ DatabaseSnapshot::DatabaseSnapshot(
fs::path db_name, fs::path db_base, DatabaseConfig config,
std::map<std::string, OnDiskIterator> iterators,
std::vector<const OnDiskDataset *> datasets,
std::unordered_map<uint64_t, TaskSpec> tasks)
std::unordered_map<uint64_t, TaskSpec> tasks, const NgramProfile *profile)
: db_name(std::move(db_name)),
db_base(std::move(db_base)),
iterators(std::move(iterators)),
config(std::move(config)),
datasets(std::move(datasets)),
tasks(std::move(tasks)) {}
tasks(std::move(tasks)),
profile(profile) {}

const OnDiskDataset *DatabaseSnapshot::find_dataset(
const std::string &name) const {
Expand Down Expand Up @@ -234,7 +235,7 @@ QueryCounters DatabaseSnapshot::execute(const Query &query,
if (!ds->has_all_taints(taints)) {
continue;
}
ds->execute(query, out, &counters);
ds->execute(query, out, &counters, *profile);
}
return counters;
}
Expand Down
4 changes: 3 additions & 1 deletion libursa/DatabaseSnapshot.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ class DatabaseSnapshot {
std::map<std::string, OnDiskIterator> iterators;
DatabaseConfig config;
std::vector<const OnDiskDataset *> datasets;
const NgramProfile *profile;
std::set<std::string> locked_datasets;
std::set<std::string> locked_iterators;
std::unordered_map<uint64_t, TaskSpec> tasks;
Expand Down Expand Up @@ -57,7 +58,8 @@ class DatabaseSnapshot {
DatabaseSnapshot(fs::path db_name, fs::path db_base, DatabaseConfig config,
std::map<std::string, OnDiskIterator> iterators,
std::vector<const OnDiskDataset *> datasets,
std::unordered_map<uint64_t, TaskSpec> tasks);
std::unordered_map<uint64_t, TaskSpec> tasks,
const NgramProfile *profile);

DatabaseName derive_name(const DatabaseName &original,
const std::string &type) const {
Expand Down
33 changes: 31 additions & 2 deletions libursa/OnDiskDataset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,20 @@
#include "Query.h"
#include "spdlog/spdlog.h"

// Returns the (approximate) size in bytes of the on-disk data for `primitive`.
// The absolute number is not meaningful by itself; it is only used as a
// relative weight, so that more common ngrams get bigger values.
// NOTE(review): callers may receive this through PrimitiveEvaluator, which is
// declared as returning uint32_t — confirm large runs can't be truncated.
// Throws std::runtime_error if the profile is non-empty but does not cover
// the primitive's index type.
uint64_t NgramProfile::size_in_bytes(PrimitiveQuery primitive) const {
    if (profiles.empty()) {
        // The profile is empty - return the same estimate for everything.
        return 0;
    }
    // Keyed lookup instead of a linear scan over the map.
    const auto it = profiles.find(primitive.itype);
    if (it == profiles.end()) {
        throw std::runtime_error("Unexpected ngram type in ngram profile");
    }
    // Run offsets: ngram X spans bytes [offsets[X], offsets[X+1]).
    const auto &offsets = it->second;
    return offsets.at(primitive.trigram + 1) - offsets.at(primitive.trigram);
}

void OnDiskDataset::save() {
std::set<std::string> index_names;
for (const auto &name : indices) {
Expand Down Expand Up @@ -85,12 +99,16 @@ QueryResult OnDiskDataset::query(const Query &query,
}

void OnDiskDataset::execute(const Query &query, ResultWriter *out,
QueryCounters *counters) const {
QueryCounters *counters,
const NgramProfile &profile) const {
std::unordered_set<IndexType> types_to_query;
for (const auto &ndx : get_indexes()) {
types_to_query.emplace(ndx.index_type());
}
const Query plan = query.plan(types_to_query);
PrimitiveEvaluator evaluator = [&profile](PrimitiveQuery primitive) {
return profile.size_in_bytes(primitive);
};
const Query plan = query.plan(types_to_query, evaluator);

QueryResult result = this->query(plan, counters);
if (result.is_everything()) {
Expand Down Expand Up @@ -312,3 +330,14 @@ std::vector<const OnDiskDataset *> OnDiskDataset::get_compact_candidates(

return out;
}

// Generates an ngram profile from this dataset. This is basically just
// copying run offsets from all indexes into a new NgramProfile object.
NgramProfile OnDiskDataset::generate_ngram_profile() const {
    std::map<IndexType, std::vector<uint64_t>> profiles;
    for (const auto &index : indices) {
        // read_run_offsets() already yields an rvalue here, so wrapping it
        // in std::move was redundant (and can pessimize RVO).
        profiles.emplace(index.index_type(), index.read_run_offsets());
    }
    return NgramProfile(std::move(profiles));
}
38 changes: 36 additions & 2 deletions libursa/OnDiskDataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,39 @@
#include "ResultWriter.h"
#include "Task.h"

// This class contains statistics about how common various ngrams are
// in the dataset. It is used during the optimisation phase, to run queries
// on small ngrams first (to speed up the overall process).
// There is a single ngram profile per database to save RAM. We could read
// this information from disk directly, but we want to avoid reading from
// disk when possible (after all, this is the point of this class).
class NgramProfile {
   private:
    // The vectors here are run offsets from OnDiskIndex, i.e. ngram X spans
    // bytes from vector[X] to vector[X+1].
    std::map<IndexType, std::vector<uint64_t>> profiles;

   public:
    // Creates an empty profile; size_in_bytes() then returns 0 for all ngrams.
    NgramProfile() : profiles() {}
    // `explicit` to forbid accidental implicit conversion from a raw map.
    explicit NgramProfile(std::map<IndexType, std::vector<uint64_t>> &&profiles)
        : profiles(std::move(profiles)) {}
    NgramProfile &operator=(NgramProfile &&other) = default;
    NgramProfile(NgramProfile &&other) = default;
    // Profiles can be large - forbid copies explicitly (assignment was
    // already implicitly deleted; spell it out for clarity).
    NgramProfile(const NgramProfile &other) = delete;
    NgramProfile &operator=(const NgramProfile &other) = delete;

    // Returns the size in bytes of data for a given ngram.
    // Worth noting that the specific number is not important. What matters is
    // that - on average - more common ngrams will return bigger values.
    uint64_t size_in_bytes(PrimitiveQuery primitive) const;
};

// Represents a single dataset. Dataset is the smallest independent data
// component in mquery. For example, it's entirely possible to copy dataset
// from one server into another and expect it to work in the same way.
// Dataset has:
// - An unique name.
// - A set of 1 or more indexes (up to one of gram3, text4, wide8, hash4).
// - List of filenames contained in this dataset.
class OnDiskDataset {
std::string name;
fs::path db_base;
Expand Down Expand Up @@ -42,8 +75,8 @@ class OnDiskDataset {
}
void toggle_taint(const std::string &taint);
bool has_all_taints(const std::set<std::string> &taints) const;
void execute(const Query &query, ResultWriter *out,
QueryCounters *counters) const;
void execute(const Query &query, ResultWriter *out, QueryCounters *counters,
const NgramProfile &profile) const;
uint64_t get_file_count() const { return files_index->get_file_count(); }
void for_each_filename(std::function<void(const std::string &)> cb) const {
files_index->for_each_filename(cb);
Expand All @@ -58,6 +91,7 @@ class OnDiskDataset {
const std::set<std::string> &get_taints() const { return taints; }
static std::vector<const OnDiskDataset *> get_compact_candidates(
const std::vector<const OnDiskDataset *> &datasets);
NgramProfile generate_ngram_profile() const;

// Returns vectors of compatible datasets. Datasets are called compatible
// when they can be merged with each other - they have the same types and
Expand Down
5 changes: 5 additions & 0 deletions libursa/OnDiskIndex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,11 @@ uint64_t find_max_batch(const std::vector<IndexMergeHelper> &indexes,
return NUM_TRIGRAMS - trigram;
}

// Returns how many bytes the on-disk run for `trigram` occupies,
// computed from the run's start/end offsets.
uint64_t OnDiskIndex::run_size_in_bytes(TriGram trigram) const {
    const auto &[run_start, run_end] = get_run_offsets(trigram);
    return run_end - run_start;
}

// Merge the indexes, and stream the results to the `out` stream immediately.
// This function tries to batch reads, which makes it much more efficient on
// HDDs (on SSDs the difference is not noticeable).
Expand Down
3 changes: 3 additions & 0 deletions libursa/OnDiskIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ class OnDiskIndex {
const std::string &get_fname() const { return fname; }
const fs::path &get_fpath() const { return fpath; }
IndexType index_type() const { return ntype; }

// Gets the size in bytes of the run for the given trigram.
uint64_t run_size_in_bytes(TriGram trigram) const;
QueryResult query(TriGram trigram, QueryCounters *counters) const;
uint64_t real_size() const;
static void on_disk_merge(const fs::path &db_base, const std::string &fname,
Expand Down
11 changes: 8 additions & 3 deletions libursa/Query.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,19 +181,24 @@ std::vector<PrimitiveQuery> plan_qstring(
return std::move(plan);
}

Query Query::plan(const std::unordered_set<IndexType> &types_to_query) const {
Query Query::plan(const std::unordered_set<IndexType> &types_to_query,
const PrimitiveEvaluator &evaluate) const {
if (type != QueryType::PRIMITIVE) {
std::vector<Query> plans;
for (const auto &query : queries) {
plans.emplace_back(query.plan(types_to_query));
plans.emplace_back(query.plan(types_to_query, evaluate));
}
if (type == QueryType::MIN_OF) {
return Query(count, std::move(plans));
}
return Query(type, std::move(plans));
}

return Query(plan_qstring(types_to_query, value));
std::vector<PrimitiveQuery> plan = plan_qstring(types_to_query, value);
std::sort(plan.begin(), plan.end(), [&evaluate](auto l, auto r) {
return evaluate(l) < evaluate(r);
});
return Query(std::move(plan));
}

QueryResult Query::run(const QueryPrimitive &primitive,
Expand Down
12 changes: 9 additions & 3 deletions libursa/Query.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ class PrimitiveQuery {
PrimitiveQuery(IndexType itype, TriGram trigram)
: itype(itype), trigram(trigram) {}

const IndexType itype;
const TriGram trigram;
IndexType itype;
TriGram trigram;

// We want to use PrimitiveQuery in STL containers, and this means they
// must be comparable using <. Specific order doesn't matter.
Expand All @@ -33,6 +33,11 @@ class PrimitiveQuery {
using QueryPrimitive =
std::function<QueryResult(PrimitiveQuery, QueryCounters *counter)>;

// Type of function used to evaluate a given PrimitiveQuery.
// The lower the result, the smaller the (expected) size of the result,
// and the higher the priority the given ngram should get.
using PrimitiveEvaluator = std::function<uint32_t(PrimitiveQuery)>;

// Query represents the query as provided by the user.
// Query can contain subqueries (using AND/OR/MINOF) or be a literal query.
// There are actually two types of literal query objects - "plain" and
Expand Down Expand Up @@ -62,7 +67,8 @@ class Query {

QueryResult run(const QueryPrimitive &primitive,
QueryCounters *counters) const;
Query plan(const std::unordered_set<IndexType> &types_to_query) const;
Query plan(const std::unordered_set<IndexType> &types_to_query,
const PrimitiveEvaluator &evaluate) const;

private:
QueryType type;
Expand Down