From cc56caba255bcb62eb05a812ce1bc25724dfda90 Mon Sep 17 00:00:00 2001
From: PikaCat <Pika-Cat@qq.com>
Date: Thu, 25 Apr 2024 16:13:47 +0800
Subject: [PATCH] Implement accumulator refresh table

For each thread, persist an accumulator cache for the network, where each
cache contains multiple entries for each of the possible king squares.
When the accumulator needs to be refreshed, the cached entry is used to
update the accumulator more efficiently, instead of rebuilding it from
scratch. This idea was first described by Luecx (author of Koivisto) and
is commonly referred to as "Finny Tables".

When the accumulator needs to be refreshed, instead of filling it with
biases and adding every piece from scratch, we...

1. Take the refresh entry (`AccumulatorCaches::Cache::Entry`) associated
   with the new king square and the advisor/bishop counts
2. Calculate the features to activate and deactivate (from differences
   between bitboards in the entry and bitboards of the actual position)
3. Apply the updates on the refresh entry
4. Copy the content of the refresh entry accumulator to the accumulator
   we were refreshing
5. Copy the bitboards from the position to the refresh entry, to match
   the newly updated accumulator

No functional change
---
 src/evaluate.cpp                    |  18 ++-
 src/evaluate.h                      |   6 +-
 src/nnue/features/half_ka_v2_hm.cpp |  19 +---
 src/nnue/features/half_ka_v2_hm.h   |  21 +++-
 src/nnue/network.cpp                |  18 +--
 src/nnue/network.h                  |  12 +-
 src/nnue/nnue_accumulator.h         |  62 +++++++++-
 src/nnue/nnue_feature_transformer.h | 171 +++++++++++++++++-----------
 src/nnue/nnue_misc.cpp              |  23 ++--
 src/nnue/nnue_misc.h                |   9 +-
 src/search.cpp                      |  27 +++--
 src/search.h                        |   7 +-
 src/uci.cpp                         |   8 +-
 13 files changed, 266 insertions(+), 135 deletions(-)
diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 1363aa14a..c579a6487 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -25,12 +25,14 @@
 #include <iomanip>
 #include <iostream>
 #include <sstream>
+#include <memory>
 
 #include "nnue/network.h"
 #include "nnue/nnue_misc.h"
 #include "position.h"
 #include "types.h"
 #include "uci.h"
+#include "nnue/nnue_accumulator.h"
 
 namespace Stockfish {
 
@@ -46,7 +48,10 @@ int Eval::simple_eval(const Position& pos, Color c) {
 
 // Evaluate is the evaluator for the outer world. It returns a static evaluation
 // of the position from the point of view of the side to move.
-Value Eval::evaluate(const Eval::NNUE::Network& network, const Position& pos, int optimism) {
+Value Eval::evaluate(const Eval::NNUE::Network& network,
+                     const Position&            pos,
+                     NNUE::AccumulatorCaches&   caches,
+                     int                        optimism) {
 
     assert(!pos.checkers());
 
@@ -56,7 +61,7 @@ Value Eval::evaluate(const Eval::NNUE::Network& network, const Position& pos, in
     int   simpleEval = simple_eval(pos, stm);
 
     int   nnueComplexity;
-    Value nnue = network.evaluate(pos, true, &nnueComplexity);
+    Value nnue = network.evaluate(pos, &caches.cache, true, &nnueComplexity);
 
     // Blend optimism and eval with nnue complexity and material imbalance
     optimism += optimism * (nnueComplexity + std::abs(simpleEval - nnue)) / 729;
@@ -80,21 +85,24 @@ Value Eval::evaluate(const Eval::NNUE::Network& network, const Position& pos, in
 // Trace scores are from white's point of view
 std::string Eval::trace(Position& pos, const Eval::NNUE::Network& network) {
 
+    auto caches = std::make_unique<Eval::NNUE::AccumulatorCaches>();
+    caches->clear(network);
+
     if (pos.checkers())
         return "Final evaluation: none (in check)";
 
     std::stringstream ss;
     ss << std::showpoint << std::noshowpos << std::fixed << std::setprecision(2);
 
-    ss << '\n' << NNUE::trace(pos, network) << '\n';
+    ss << '\n' << NNUE::trace(pos, network, *caches) << '\n';
 
     ss << std::showpoint << std::showpos << std::fixed << std::setprecision(2) << std::setw(15);
 
-    Value v = network.evaluate(pos);
+    Value v = network.evaluate(pos, &caches->cache);
     v       = pos.side_to_move() == WHITE ? v : -v;
     ss << "NNUE evaluation        " << 0.01 * UCIEngine::to_cp(v, pos) << " (white side)\n";
 
-    v = evaluate(network, pos, VALUE_ZERO);
+    v = evaluate(network, pos, *caches, VALUE_ZERO);
     v = pos.side_to_move() == WHITE ? v : -v;
     ss << "Final evaluation       " << 0.01 * UCIEngine::to_cp(v, pos) << " (white side)";
     ss << " [with scaled NNUE, ...]";
diff --git a/src/evaluate.h b/src/evaluate.h
index 60fea5f6d..25b94ca64 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -37,12 +37,16 @@ namespace Eval {
 
 namespace NNUE {
 class Network;
+struct AccumulatorCaches;
 }
 
 std::string trace(Position& pos, const Eval::NNUE::Network& network);
 
 int   simple_eval(const Position& pos, Color c);
-Value evaluate(const NNUE::Network& network, const Position& pos, int optimism);
+Value evaluate(const NNUE::Network&           network,
+               const Position&                pos,
+               Eval::NNUE::AccumulatorCaches& caches,
+               int                            optimism);
 
 }  // namespace Eval
 
diff --git a/src/nnue/features/half_ka_v2_hm.cpp b/src/nnue/features/half_ka_v2_hm.cpp
index f01bc6eff..0a16df1b3 100644
--- a/src/nnue/features/half_ka_v2_hm.cpp
+++ b/src/nnue/features/half_ka_v2_hm.cpp
@@ -23,7 +23,7 @@
 #include "../../bitboard.h"
 #include "../../position.h"
 #include "../../types.h"
-#include "../nnue_common.h"
+#include "../nnue_accumulator.h"
 
 namespace Stockfish::Eval::NNUE::Features {
 
@@ -36,22 +36,9 @@ inline IndexType HalfKAv2_hm::make_index(Square s, Piece pc, Square ksq, int ab)
                      + PS_NB * ((KingBuckets[ksq] & 0x7) * 9 + ab));
 }
 
-// Get a list of indices for active features
-template<Color Perspective>
-void HalfKAv2_hm::append_active_indices(const Position& pos, IndexList& active) {
-    Square   ksq = pos.square<KING>(Perspective);
-    int      ab  = pos.count<ADVISOR>(Perspective) * 3 + pos.count<BISHOP>(Perspective);
-    Bitboard bb  = pos.pieces();
-    while (bb)
-    {
-        Square s = pop_lsb(bb);
-        active.push_back(make_index<Perspective>(s, pos.piece_on(s), ksq, ab));
-    }
-}
-
 // Explicit template instantiations
-template void HalfKAv2_hm::append_active_indices<WHITE>(const Position& pos, IndexList& active);
-template void HalfKAv2_hm::append_active_indices<BLACK>(const Position& pos, IndexList& active);
+template IndexType HalfKAv2_hm::make_index<WHITE>(Square s, Piece pc, Square ksq, int ab);
+template IndexType HalfKAv2_hm::make_index<BLACK>(Square s, Piece pc, Square ksq, int ab);
 
 // Get a list of indices for recently changed features
 template<Color Perspective>
diff --git a/src/nnue/features/half_ka_v2_hm.h b/src/nnue/features/half_ka_v2_hm.h
index 23f4e505c..5259ece36 100644
--- a/src/nnue/features/half_ka_v2_hm.h
+++ b/src/nnue/features/half_ka_v2_hm.h
@@ -64,10 +64,6 @@ class HalfKAv2_hm {
     };
     // clang-format on
 
-    // Index of a feature for a given king position and another piece on some square
-    template<Color Perspective>
-    static IndexType make_index(Square s, Piece pc, Square ksq, int ab);
-
    public:
     // Feature name
     static constexpr const char* Name = "HalfKAv2_hm";
@@ -95,6 +91,19 @@ class HalfKAv2_hm {
     };
 #undef M
 
+    static constexpr uint8_t KingCacheMaps[SQUARE_NB] = {
+        0,  0,  0,  0,  1,  2,  0,  0,  0,
+        0,  0,  0,  5,  4,  3,  0,  0,  0,
+        0,  0,  0,  6,  7,  8,  0,  0,  0,
+        0,  0,  0,  0,  0,  0,  0,  0,  0,
+        0,  0,  0,  0,  0,  0,  0,  0,  0,
+        0,  0,  0,  0,  0,  0,  0,  0,  0,
+        0,  0,  0,  0,  0,  0,  0,  0,  0,
+        0,  0,  0,  6,  7,  8,  0,  0,  0,
+        0,  0,  0,  5,  4,  3,  0,  0,  0,
+        0,  0,  0,  0,  1,  2,  0,  0,  0,
+    };
+
     // Map advisor and bishop location into White King plane
     static constexpr uint8_t ABMap[SQUARE_NB] = {
         0,  0,  0,  1,  0,  2,  5,  0,  0,
@@ -133,9 +142,9 @@ class HalfKAv2_hm {
     static constexpr IndexType MaxActiveDimensions = 32;
     using IndexList                                = ValueList<IndexType, MaxActiveDimensions>;
 
-    // Get a list of indices for active features
+    // Index of a feature for a given king position and another piece on some square
     template<Color Perspective>
-    static void append_active_indices(const Position& pos, IndexList& active);
+    static IndexType make_index(Square s, Piece pc, Square ksq, int ab);
 
     // Get a list of indices for recently changed features
     template<Color Perspective>
diff --git a/src/nnue/network.cpp b/src/nnue/network.cpp
index 5ec42b05f..93dbeaae3 100644
--- a/src/nnue/network.cpp
+++ b/src/nnue/network.cpp
@@ -126,7 +126,10 @@ bool Network::save(const std::optional<std::string>& filename) const {
 }
 
 
-Value Network::evaluate(const Position& pos, bool adjusted, int* complexity) const {
+Value Network::evaluate(const Position&           pos,
+                        AccumulatorCaches::Cache* cache,
+                        bool                      adjusted,
+                        int*                      complexity) const {
     // We manually align the arrays on the stack because with gcc < 9.3
     // overaligning stack variables with alignas() doesn't work correctly.
 
@@ -145,7 +148,7 @@ Value Network::evaluate(const Position& pos, bool adjusted, int* complexity) con
     ASSERT_ALIGNED(transformedFeatures, alignment);
 
     const int  bucket     = (pos.count<ALL_PIECES>() - 1) / 4;
-    const auto psqt       = featureTransformer->transform(pos, transformedFeatures, bucket);
+    const auto psqt       = featureTransformer->transform(pos, cache, transformedFeatures, bucket);
     const auto positional = network[bucket]->propagate(transformedFeatures);
 
     if (complexity)
@@ -188,12 +191,12 @@ void Network::verify(std::string evalfilePath) const {
 }
 
 
-void Network::hint_common_access(const Position& pos) const {
-    featureTransformer->hint_common_access(pos);
+void Network::hint_common_access(const Position& pos, AccumulatorCaches::Cache* cache) const {
+    featureTransformer->hint_common_access(pos, cache);
 }
 
 
-NnueEvalTrace Network::trace_evaluate(const Position& pos) const {
+NnueEvalTrace Network::trace_evaluate(const Position& pos, AccumulatorCaches::Cache* cache) const {
     // We manually align the arrays on the stack because with gcc < 9.3
     // overaligning stack variables with alignas() doesn't work correctly.
     constexpr uint64_t alignment = CacheLineSize;
@@ -214,8 +217,9 @@ NnueEvalTrace Network::trace_evaluate(const Position& pos) const {
     t.correctBucket = (pos.count<ALL_PIECES>() - 1) / 4;
     for (IndexType bucket = 0; bucket < LayerStacks; ++bucket)
     {
-        const auto materialist = featureTransformer->transform(pos, transformedFeatures, bucket);
-        const auto positional  = network[bucket]->propagate(transformedFeatures);
+        const auto materialist =
+          featureTransformer->transform(pos, cache, transformedFeatures, bucket);
+        const auto positional = network[bucket]->propagate(transformedFeatures);
 
         t.psqt[bucket]       = static_cast<Value>(materialist / OutputScale);
         t.positional[bucket] = static_cast<Value>(positional / OutputScale);
diff --git a/src/nnue/network.h b/src/nnue/network.h
index 13ecab11c..c489785f1 100644
--- a/src/nnue/network.h
+++ b/src/nnue/network.h
@@ -26,6 +26,7 @@
 #include "nnue_architecture.h"
 #include "nnue_feature_transformer.h"
 #include "nnue_misc.h"
+#include "nnue_accumulator.h"
 
 namespace Stockfish {
 
@@ -42,13 +43,16 @@ class Network {
     bool save(const std::optional<std::string>& filename) const;
 
 
-    Value evaluate(const Position& pos, bool adjusted = false, int* complexity = nullptr) const;
+    Value evaluate(const Position&           pos,
+                   AccumulatorCaches::Cache* cache,
+                   bool                      adjusted   = false,
+                   int*                      complexity = nullptr) const;
 
 
-    void hint_common_access(const Position& pos) const;
+    void hint_common_access(const Position& pos, AccumulatorCaches::Cache* cache) const;
 
     void          verify(std::string evalfilePath) const;
-    NnueEvalTrace trace_evaluate(const Position& pos) const;
+    NnueEvalTrace trace_evaluate(const Position& pos, AccumulatorCaches::Cache* cache) const;
 
    private:
     void load_user_net(const std::string&, const std::string&);
@@ -75,6 +79,8 @@ class Network {
     // Hash value of evaluation function structure
     static constexpr std::uint32_t hash =
       FeatureTransformer::get_hash_value() ^ NetworkArchitecture::get_hash_value();
+
+    friend struct AccumulatorCaches::Cache;
 };
 
 }  // namespace Stockfish::Eval::NNUE
diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h
index f6d705243..f3ab1a0a8 100644
--- a/src/nnue/nnue_accumulator.h
+++ b/src/nnue/nnue_accumulator.h
@@ -28,11 +28,67 @@
 
 namespace Stockfish::Eval::NNUE {
 
+using BiasType       = std::int16_t;
+using PSQTWeightType = std::int32_t;
+using IndexType      = std::uint32_t;
+
 // Class that holds the result of affine transformation of input features
 struct alignas(CacheLineSize) Accumulator {
-    std::int16_t accumulation[2][TransformedFeatureDimensions];
-    std::int32_t psqtAccumulation[2][PSQTBuckets];
-    bool         computed[2];
+    std::int16_t accumulation[COLOR_NB][TransformedFeatureDimensions];
+    std::int32_t psqtAccumulation[COLOR_NB][PSQTBuckets];
+    bool         computed[COLOR_NB];
+};
+
+
+// AccumulatorCaches struct provides per-thread accumulator caches, where each
+// cache contains multiple entries for each of the possible king squares.
+// When the accumulator needs to be refreshed, the cached entry is used to
+// update the accumulator more efficiently, instead of rebuilding it from
+// scratch. This idea was first described by Luecx (author of Koivisto) and
+// is commonly referred to as "Finny Tables".
+struct AccumulatorCaches {
+
+    struct alignas(CacheLineSize) Cache {
+
+        struct alignas(CacheLineSize) Entry {
+            BiasType       accumulation[COLOR_NB][TransformedFeatureDimensions];
+            PSQTWeightType psqtAccumulation[COLOR_NB][PSQTBuckets];
+            Bitboard       byColorBB[COLOR_NB][COLOR_NB];
+            Bitboard       byTypeBB[COLOR_NB][PIECE_TYPE_NB];
+
+            // To initialize a refresh entry, we set all its bitboards empty,
+            // so we put the biases in the accumulation, without any weights on top
+            void clear(const BiasType* biases) {
+
+                std::memset(byColorBB, 0, sizeof(byColorBB));
+                std::memset(byTypeBB, 0, sizeof(byTypeBB));
+
+                std::memcpy(accumulation[WHITE], biases,
+                            TransformedFeatureDimensions * sizeof(BiasType));
+                std::memcpy(accumulation[BLACK], biases,
+                            TransformedFeatureDimensions * sizeof(BiasType));
+
+                std::memset(psqtAccumulation, 0, sizeof(psqtAccumulation));
+            }
+        };
+
+        template<typename Network>
+        void clear(const Network& network) {
+            for (auto& entry : entries)
+                entry.clear(network.featureTransformer->biases);
+        }
+
+        Entry& operator[](int index) { return entries[index]; }
+
+        std::array<Entry, 9 * 3 * 3> entries;
+    };
+
+    template<typename Network>
+    void clear(const Network& network) {
+        cache.clear(network);
+    }
+
+    Cache cache;
 };
 
 }  // namespace Stockfish::Eval::NNUE
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index c7381137f..1331766fa 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -232,10 +232,10 @@ static constexpr int NumPsqtRegs =
 // Input feature converter
 class FeatureTransformer {
 
-   private:
     // Number of output dimensions for one side
     static constexpr IndexType HalfDimensions = TransformedFeatureDimensions;
 
+   private:
 #ifdef VECTOR
     static constexpr IndexType TileHeight     = NumRegs * sizeof(vec_t) / 2;
     static constexpr IndexType PsqtTileHeight = NumPsqtRegs * sizeof(psqt_vec_t) / 4;
@@ -334,9 +334,12 @@ class FeatureTransformer {
     }
 
     // Convert input features
-    std::int32_t transform(const Position& pos, OutputType* output, int bucket) const {
-        update_accumulator<WHITE>(pos);
-        update_accumulator<BLACK>(pos);
+    std::int32_t transform(const Position&           pos,
+                           AccumulatorCaches::Cache* cache,
+                           OutputType*               output,
+                           int                       bucket) const {
+        update_accumulator<WHITE>(pos, cache);
+        update_accumulator<BLACK>(pos, cache);
 
         const Color perspectives[2]  = {pos.side_to_move(), ~pos.side_to_move()};
         const auto& accumulation     = pos.state()->accumulator.accumulation;
@@ -396,9 +399,9 @@ class FeatureTransformer {
         return psqt;
     }  // end of function transform()
 
-    void hint_common_access(const Position& pos) const {
-        hint_common_access_for_perspective<WHITE>(pos);
-        hint_common_access_for_perspective<BLACK>(pos);
+    void hint_common_access(const Position& pos, AccumulatorCaches::Cache* cache) const {
+        hint_common_access_for_perspective<WHITE>(pos, cache);
+        hint_common_access_for_perspective<BLACK>(pos, cache);
     }
 
    private:
@@ -662,116 +665,148 @@ class FeatureTransformer {
     }
 
     template<Color Perspective>
-    void update_accumulator_refresh(const Position& pos) const {
-#ifdef VECTOR
-        // Gcc-10.2 unnecessarily spills AVX2 registers if this array
-        // is defined in the VECTOR code below, once in each branch
-        vec_t      acc[NumRegs];
-        psqt_vec_t psqt[NumPsqtRegs];
-#endif
+    void update_accumulator_refresh(const Position& pos, AccumulatorCaches::Cache* cache) const {
+        assert(cache != nullptr);
+
+        const Square ksq = pos.square<KING>(Perspective);
+        const int    ab  = pos.count<ADVISOR>(Perspective) * 3 + pos.count<BISHOP>(Perspective);
+
+        auto& entry = (*cache)[FeatureSet::KingCacheMaps[ksq] * 9 + ab];
 
-        // Refresh the accumulator
-        // Could be extracted to a separate function because it's done in 2 places,
-        // but it's unclear if compilers would correctly handle register allocation.
         auto& accumulator                 = pos.state()->accumulator;
         accumulator.computed[Perspective] = true;
-        FeatureSet::IndexList active;
-        FeatureSet::append_active_indices<Perspective>(pos, active);
+
+        FeatureSet::IndexList removed, added;
+        for (Color c : {WHITE, BLACK})
+        {
+            for (PieceType pt = ROOK; pt <= KING; ++pt)
+            {
+                const Piece    piece = make_piece(c, pt);
+                const Bitboard oldBB =
+                  entry.byColorBB[Perspective][c] & entry.byTypeBB[Perspective][pt];
+                const Bitboard newBB    = pos.pieces(c, pt);
+                Bitboard       toRemove = oldBB & ~newBB;
+                Bitboard       toAdd    = newBB & ~oldBB;
+
+                while (toRemove)
+                {
+                    Square sq = pop_lsb(toRemove);
+                    removed.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq, ab));
+                }
+                while (toAdd)
+                {
+                    Square sq = pop_lsb(toAdd);
+                    added.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq, ab));
+                }
+            }
+        }
 
 #ifdef VECTOR
+        vec_t      acc[NumRegs];
+        psqt_vec_t psqt[NumPsqtRegs];
+
         for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j)
         {
-            auto biasesTile = reinterpret_cast<const vec_t*>(&biases[j * TileHeight]);
+            auto entryTile =
+              reinterpret_cast<vec_t*>(&entry.accumulation[Perspective][j * TileHeight]);
             for (IndexType k = 0; k < NumRegs; ++k)
-                acc[k] = biasesTile[k];
+                acc[k] = entryTile[k];
 
-            int i = 0;
-            for (; i < int(active.size()) - 1; i += 2)
+            for (int i = 0; i < int(added.size()); ++i)
             {
-                IndexType       index0  = active[i];
-                IndexType       index1  = active[i + 1];
-                const IndexType offset0 = HalfDimensions * index0 + j * TileHeight;
-                const IndexType offset1 = HalfDimensions * index1 + j * TileHeight;
-                auto            column0 = reinterpret_cast<const vec_t*>(&weights[offset0]);
-                auto            column1 = reinterpret_cast<const vec_t*>(&weights[offset1]);
+                IndexType       index  = added[i];
+                const IndexType offset = HalfDimensions * index + j * TileHeight;
+                auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);
 
                 for (unsigned k = 0; k < NumRegs; ++k)
-                    acc[k] = vec_add_16(acc[k], vec_add_16(column0[k], column1[k]));
+                    acc[k] = vec_add_16(acc[k], column[k]);
             }
-            for (; i < int(active.size()); ++i)
+            for (int i = 0; i < int(removed.size()); ++i)
             {
-                IndexType       index  = active[i];
+                IndexType       index  = removed[i];
                 const IndexType offset = HalfDimensions * index + j * TileHeight;
                 auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);
 
                 for (unsigned k = 0; k < NumRegs; ++k)
-                    acc[k] = vec_add_16(acc[k], column[k]);
+                    acc[k] = vec_sub_16(acc[k], column[k]);
             }
 
-            auto accTile =
-              reinterpret_cast<vec_t*>(&accumulator.accumulation[Perspective][j * TileHeight]);
-            for (unsigned k = 0; k < NumRegs; k++)
-                vec_store(&accTile[k], acc[k]);
+            for (IndexType k = 0; k < NumRegs; k++)
+                vec_store(&entryTile[k], acc[k]);
         }
 
         for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j)
         {
+            auto entryTilePsqt = reinterpret_cast<psqt_vec_t*>(
+              &entry.psqtAccumulation[Perspective][j * PsqtTileHeight]);
             for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                psqt[k] = vec_zero_psqt();
+                psqt[k] = entryTilePsqt[k];
 
-            int i = 0;
-            for (; i < int(active.size()) - 1; i += 2)
+            for (int i = 0; i < int(added.size()); ++i)
             {
-                IndexType       index0  = active[i];
-                IndexType       index1  = active[i + 1];
-                const IndexType offset0 = PSQTBuckets * index0 + j * PsqtTileHeight;
-                const IndexType offset1 = PSQTBuckets * index1 + j * PsqtTileHeight;
-                auto columnPsqt0 = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset0]);
-                auto columnPsqt1 = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset1]);
+                IndexType       index  = added[i];
+                const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight;
+                auto columnPsqt        = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);
 
                 for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                    psqt[k] =
-                      vec_add_psqt_32(psqt[k], vec_add_psqt_32(columnPsqt0[k], columnPsqt1[k]));
+                    psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]);
             }
-            for (; i < int(active.size()); ++i)
+            for (int i = 0; i < int(removed.size()); ++i)
             {
-                IndexType       index  = active[i];
+                IndexType       index  = removed[i];
                 const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight;
                 auto columnPsqt        = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);
 
                 for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                    psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]);
+                    psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]);
             }
 
-            auto accTilePsqt = reinterpret_cast<psqt_vec_t*>(
-              &accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]);
             for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                vec_store_psqt(&accTilePsqt[k], psqt[k]);
+                vec_store_psqt(&entryTilePsqt[k], psqt[k]);
         }
 
 #else
-        std::memcpy(accumulator.accumulation[Perspective], biases,
-                    HalfDimensions * sizeof(BiasType));
 
-        for (std::size_t k = 0; k < PSQTBuckets; ++k)
-            accumulator.psqtAccumulation[Perspective][k] = 0;
-
-        for (const auto index : active)
+        for (const auto index : added)
         {
             const IndexType offset = HalfDimensions * index;
+            for (IndexType j = 0; j < HalfDimensions; ++j)
+                entry.accumulation[Perspective][j] += weights[offset + j];
 
+            for (std::size_t k = 0; k < PSQTBuckets; ++k)
+                entry.psqtAccumulation[Perspective][k] += psqtWeights[index * PSQTBuckets + k];
+        }
+        for (const auto index : removed)
+        {
+            const IndexType offset = HalfDimensions * index;
             for (IndexType j = 0; j < HalfDimensions; ++j)
-                accumulator.accumulation[Perspective][j] += weights[offset + j];
+                entry.accumulation[Perspective][j] -= weights[offset + j];
 
             for (std::size_t k = 0; k < PSQTBuckets; ++k)
-                accumulator.psqtAccumulation[Perspective][k] +=
-                  psqtWeights[index * PSQTBuckets + k];
+                entry.psqtAccumulation[Perspective][k] -= psqtWeights[index * PSQTBuckets + k];
         }
+
 #endif
+
+        // The accumulator of the refresh entry has been updated.
+        // Now copy its content to the actual accumulator we were refreshing
+
+        std::memcpy(accumulator.psqtAccumulation[Perspective], entry.psqtAccumulation[Perspective],
+                    sizeof(int32_t) * PSQTBuckets);
+
+        std::memcpy(accumulator.accumulation[Perspective], entry.accumulation[Perspective],
+                    sizeof(BiasType) * HalfDimensions);
+
+        for (Color c : {WHITE, BLACK})
+            entry.byColorBB[Perspective][c] = pos.pieces(c);
+
+        for (PieceType pt = ROOK; pt <= KING; ++pt)
+            entry.byTypeBB[Perspective][pt] = pos.pieces(pt);
     }
 
     template<Color Perspective>
-    void hint_common_access_for_perspective(const Position& pos) const {
+    void hint_common_access_for_perspective(const Position&           pos,
+                                            AccumulatorCaches::Cache* cache) const {
 
         // Works like update_accumulator, but performs less work.
         // Updates ONLY the accumulator for pos.
@@ -791,11 +826,11 @@ class FeatureTransformer {
             update_accumulator_incremental<Perspective, 2>(pos, oldest_st, states_to_update);
         }
         else
-            update_accumulator_refresh<Perspective>(pos);
+            update_accumulator_refresh<Perspective>(pos, cache);
     }
 
     template<Color Perspective>
-    void update_accumulator(const Position& pos) const {
+    void update_accumulator(const Position& pos, AccumulatorCaches::Cache* cache) const {
 
         auto [oldest_st, next] = try_find_computed_accumulator<Perspective>(pos);
 
@@ -816,10 +851,12 @@ class FeatureTransformer {
         }
         else
         {
-            update_accumulator_refresh<Perspective>(pos);
+            update_accumulator_refresh<Perspective>(pos, cache);
         }
     }
 
+    friend struct AccumulatorCaches::Cache;
+
     alignas(CacheLineSize) BiasType biases[HalfDimensions];
     alignas(CacheLineSize) WeightType weights[HalfDimensions * InputDimensions];
     alignas(CacheLineSize) PSQTWeightType psqtWeights[InputDimensions * PSQTBuckets];
diff --git a/src/nnue/nnue_misc.cpp b/src/nnue/nnue_misc.cpp
index 8da47e9c3..762fe49e5 100644
--- a/src/nnue/nnue_misc.cpp
+++ b/src/nnue/nnue_misc.cpp
@@ -38,9 +38,11 @@ namespace Stockfish::Eval::NNUE {
 constexpr std::string_view PieceToChar(" RACPNBK racpnbk");
 
 
-void hint_common_parent_position(const Position& pos, const Network& network) {
+void hint_common_parent_position(const Position&    pos,
+                                 const Network&     network,
+                                 AccumulatorCaches& caches) {
 
-    network.hint_common_access(pos);
+    network.hint_common_access(pos, &caches.cache);
 }
 
 namespace {
@@ -96,7 +98,7 @@ void format_cp_aligned_dot(Value v, std::stringstream& stream, const Position& p
 
 // Returns a string with the value of each piece on a board,
 // and a table for (PSQT, Layers) values bucket by bucket.
-std::string trace(Position& pos, const Eval::NNUE::Network& network) {
+std::string trace(Position& pos, const Eval::NNUE::Network& network, AccumulatorCaches& caches) {
 
     std::stringstream ss;
 
@@ -122,7 +124,7 @@ std::string trace(Position& pos, const Eval::NNUE::Network& network) {
 
     // We estimate the value of each piece by doing a differential evaluation from
     // the current base eval, simulating the removal of the piece from its square.
-    Value base = network.evaluate(pos);
+    Value base = network.evaluate(pos, &caches.cache);
     base       = pos.side_to_move() == WHITE ? base : -base;
 
     for (File f = FILE_A; f <= FILE_I; ++f)
@@ -140,7 +142,7 @@ std::string trace(Position& pos, const Eval::NNUE::Network& network) {
                 st->accumulator.computed[WHITE] = false;
                 st->accumulator.computed[BLACK] = false;
 
-                Value eval = network.evaluate(pos);
+                Value eval = network.evaluate(pos, &caches.cache);
                 eval       = pos.side_to_move() == WHITE ? eval : -eval;
                 v          = base - eval;
 
@@ -157,7 +159,7 @@ std::string trace(Position& pos, const Eval::NNUE::Network& network) {
         ss << board[row] << '\n';
     ss << '\n';
 
-    auto t = network.trace_evaluate(pos);
+    auto t = network.trace_evaluate(pos, &caches.cache);
 
     ss << " NNUE network contributions "
        << (pos.side_to_move() == WHITE ? "(White to move)" : "(Black to move)") << std::endl
@@ -171,11 +173,14 @@ std::string trace(Position& pos, const Eval::NNUE::Network& network) {
         ss << "|  " << bucket << "        ";
         ss << " |  ";
         format_cp_aligned_dot(t.psqt[bucket], ss, pos);
-        ss << "  " << " |  ";
+        ss << "  "
+           << " |  ";
         format_cp_aligned_dot(t.positional[bucket], ss, pos);
-        ss << "  " << " |  ";
+        ss << "  "
+           << " |  ";
         format_cp_aligned_dot(t.psqt[bucket] + t.positional[bucket], ss, pos);
-        ss << "  " << " |";
+        ss << "  "
+           << " |";
         if (bucket == t.correctBucket)
             ss << " <-- this bucket is used";
         ss << '\n';
diff --git a/src/nnue/nnue_misc.h b/src/nnue/nnue_misc.h
index ce9960a36..e48516f10 100644
--- a/src/nnue/nnue_misc.h
+++ b/src/nnue/nnue_misc.h
@@ -47,12 +47,13 @@ struct NnueEvalTrace {
     std::size_t correctBucket;
 };
 
-
 class Network;
+struct AccumulatorCaches;
 
-
-std::string trace(Position& pos, const Network& network);
-void        hint_common_parent_position(const Position& pos, const Network& network);
+std::string trace(Position& pos, const Network& network, AccumulatorCaches& caches);
+void        hint_common_parent_position(const Position&    pos,
+                                        const Network&     network,
+                                        AccumulatorCaches& caches);
 
 }  // namespace Stockfish::Eval::NNUE
 }  // namespace Stockfish
diff --git a/src/search.cpp b/src/search.cpp
index 6fb9a9632..1a28e5d40 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -33,6 +33,8 @@
 #include "misc.h"
 #include "movegen.h"
 #include "movepick.h"
+#include "nnue/network.h"
+#include "nnue/nnue_accumulator.h"
 #include "nnue/nnue_common.h"
 #include "nnue/nnue_misc.h"
 #include "position.h"
@@ -108,6 +110,7 @@ Search::Worker::Worker(SharedState&                    sharedState,
     // Unpack the SharedState struct into member variables
     thread_idx(thread_id),
     manager(std::move(sm)),
+    refreshTable(),
     options(sharedState.options),
     threads(sharedState.threads),
     tt(sharedState.tt),
@@ -116,6 +119,10 @@ Search::Worker::Worker(SharedState&                    sharedState,
 }
 
 void Search::Worker::start_searching() {
+
+    // Initialize accumulator refresh entries
+    refreshTable.clear(network);
+
     // Non-main threads go directly to iterative_deepening()
     if (!is_mainthread())
     {
@@ -513,7 +520,7 @@ Value Search::Worker::search(
 
         if (threads.stop.load(std::memory_order_relaxed) || ss->ply >= MAX_PLY)
             return (ss->ply >= MAX_PLY && !ss->inCheck)
-                   ? evaluate(network, pos, thisThread->optimism[us])
+                   ? evaluate(network, pos, refreshTable, thisThread->optimism[us])
                    : value_draw(thisThread->nodes);
 
         // Step 3. Mate distance pruning. Even if we mate at the next move our score
@@ -594,7 +601,7 @@ Value Search::Worker::search(
     {
         // Providing the hint that this node's accumulator will be used often
         // brings significant Elo gain (~13 Elo).
-        Eval::NNUE::hint_common_parent_position(pos, network);
+        Eval::NNUE::hint_common_parent_position(pos, network, refreshTable);
         unadjustedStaticEval = eval = ss->staticEval;
     }
     else if (ss->ttHit)
@@ -602,9 +609,9 @@ Value Search::Worker::search(
         // Never assume anything about values stored in TT
         unadjustedStaticEval = tte->eval();
         if (unadjustedStaticEval == VALUE_NONE)
-            unadjustedStaticEval = evaluate(network, pos, thisThread->optimism[us]);
+            unadjustedStaticEval = evaluate(network, pos, refreshTable, thisThread->optimism[us]);
         else if (PvNode)
-            Eval::NNUE::hint_common_parent_position(pos, network);
+            Eval::NNUE::hint_common_parent_position(pos, network, refreshTable);
 
         ss->staticEval = eval = to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos);
 
@@ -614,7 +621,7 @@ Value Search::Worker::search(
     }
     else
     {
-        unadjustedStaticEval = evaluate(network, pos, thisThread->optimism[us]);
+        unadjustedStaticEval = evaluate(network, pos, refreshTable, thisThread->optimism[us]);
         ss->staticEval = eval = to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos);
 
         // Static evaluation is saved as it was before adjustment by correction history
@@ -771,7 +778,7 @@ Value Search::Worker::search(
                 }
             }
 
-        Eval::NNUE::hint_common_parent_position(pos, network);
+        Eval::NNUE::hint_common_parent_position(pos, network, refreshTable);
     }
 
 moves_loop:  // When in check, search starts here
@@ -1320,7 +1327,8 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta,
     }
 
     if (ss->ply >= MAX_PLY)
-        return !ss->inCheck ? evaluate(network, pos, thisThread->optimism[us]) : VALUE_DRAW;
+        return !ss->inCheck ? evaluate(network, pos, refreshTable, thisThread->optimism[us])
+                            : VALUE_DRAW;
 
     assert(0 <= ss->ply && ss->ply < MAX_PLY);
 
@@ -1351,7 +1359,8 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta,
             // Never assume anything about values stored in TT
             unadjustedStaticEval = tte->eval();
             if (unadjustedStaticEval == VALUE_NONE)
-                unadjustedStaticEval = evaluate(network, pos, thisThread->optimism[us]);
+                unadjustedStaticEval =
+                  evaluate(network, pos, refreshTable, thisThread->optimism[us]);
             ss->staticEval = bestValue =
               to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos);
 
@@ -1364,7 +1373,7 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta,
         {
             // In case of null move search, use previous static eval with a different sign
             unadjustedStaticEval = (ss - 1)->currentMove != Move::null()
-                                   ? evaluate(network, pos, thisThread->optimism[us])
+                                   ? evaluate(network, pos, refreshTable, thisThread->optimism[us])
                                    : -(ss - 1)->staticEval;
             ss->staticEval       = bestValue =
               to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos);
diff --git a/src/search.h b/src/search.h
index 28edf6b4e..e09eae7b5 100644
--- a/src/search.h
+++ b/src/search.h
@@ -26,9 +26,9 @@
 #include <cstdint>
 #include <functional>
 #include <memory>
+#include <string>
 #include <string_view>
 #include <vector>
-#include <string>
 
 #include "misc.h"
 #include "movepick.h"
@@ -36,6 +36,7 @@
 #include "score.h"
 #include "timeman.h"
 #include "types.h"
+#include "nnue/nnue_accumulator.h"
 
 namespace Stockfish {
 
@@ -295,6 +296,10 @@ class Worker {
     // The main thread has a SearchManager, the others have a NullSearchManager
     std::unique_ptr<ISearchManager> manager;
 
+    // Per-thread NNUE accumulator refresh cache, cleared in start_searching()
+
+    Eval::NNUE::AccumulatorCaches refreshTable;
+
     const OptionsMap&          options;
     ThreadPool&                threads;
     TranspositionTable&        tt;
diff --git a/src/uci.cpp b/src/uci.cpp
index f13fadad7..4556bf087 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -275,9 +275,9 @@ void UCIEngine::bench(std::istream& args) {
 
     dbg_print();
 
-    std::cerr << "\n===========================" << "\nTotal time (ms) : " << elapsed
-              << "\nNodes searched  : " << nodes << "\nNodes/second    : " << 1000 * nodes / elapsed
-              << std::endl;
+    std::cerr << "\n==========================="
+              << "\nTotal time (ms) : " << elapsed << "\nNodes searched  : " << nodes
+              << "\nNodes/second    : " << 1000 * nodes / elapsed << std::endl;
 
     // reset callback, to not capture a dangling reference to nodesSearched
     engine.set_on_update_full([&](const auto& i) { on_update_full(i, options["UCI_ShowWDL"]); });
@@ -290,7 +290,7 @@ void UCIEngine::setoption(std::istringstream& is) {
 }
 
 std::uint64_t UCIEngine::perft(const Search::LimitsType& limits) {
-    auto nodes = engine.perft(engine.fen(), limits.perft, engine.get_options()["UCI_Chess960"]);
+    auto nodes = engine.perft(engine.fen(), limits.perft);
     sync_cout << "\nNodes searched: " << nodes << "\n" << sync_endl;
     return nodes;
 }