diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 1363aa14a..c579a6487 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -25,12 +25,14 @@
 #include
 #include
 #include
+#include <memory>

 #include "nnue/network.h"
 #include "nnue/nnue_misc.h"
 #include "position.h"
 #include "types.h"
 #include "uci.h"
+#include "nnue/nnue_accumulator.h"

 namespace Stockfish {

@@ -46,7 +48,10 @@ int Eval::simple_eval(const Position& pos, Color c) {

 // Evaluate is the evaluator for the outer world. It returns a static evaluation
 // of the position from the point of view of the side to move.
-Value Eval::evaluate(const Eval::NNUE::Network& network, const Position& pos, int optimism) {
+Value Eval::evaluate(const Eval::NNUE::Network& network,
+                     const Position&            pos,
+                     NNUE::AccumulatorCaches&   caches,
+                     int                        optimism) {

     assert(!pos.checkers());

@@ -56,7 +61,7 @@ Value Eval::evaluate(const Eval::NNUE::Network& network, const Position& pos, in
     int simpleEval = simple_eval(pos, stm);
     int nnueComplexity;

-    Value nnue = network.evaluate(pos, true, &nnueComplexity);
+    Value nnue = network.evaluate(pos, &caches.cache, true, &nnueComplexity);

     // Blend optimism and eval with nnue complexity and material imbalance
     optimism += optimism * (nnueComplexity + std::abs(simpleEval - nnue)) / 729;
@@ -80,21 +85,24 @@ Value Eval::evaluate(const Eval::NNUE::Network& network, const Position& pos, in
 // Trace scores are from white's point of view
 std::string Eval::trace(Position& pos, const Eval::NNUE::Network& network) {

+    auto caches = std::make_unique<Eval::NNUE::AccumulatorCaches>();
+    caches->clear(network);
+
     if (pos.checkers())
         return "Final evaluation: none (in check)";

     std::stringstream ss;
     ss << std::showpoint << std::noshowpos << std::fixed << std::setprecision(2);
-    ss << '\n' << NNUE::trace(pos, network) << '\n';
+    ss << '\n' << NNUE::trace(pos, network, *caches) << '\n';

     ss << std::showpoint << std::showpos << std::fixed << std::setprecision(2) << std::setw(15);

-    Value v = network.evaluate(pos);
+    Value v = network.evaluate(pos, &caches->cache);
     v = pos.side_to_move() == WHITE ? v : -v;
     ss << "NNUE evaluation " << 0.01 * UCIEngine::to_cp(v, pos) << " (white side)\n";

-    v = evaluate(network, pos, VALUE_ZERO);
+    v = evaluate(network, pos, *caches, VALUE_ZERO);
     v = pos.side_to_move() == WHITE ? v : -v;
     ss << "Final evaluation " << 0.01 * UCIEngine::to_cp(v, pos) << " (white side)";
     ss << " [with scaled NNUE, ...]";
diff --git a/src/evaluate.h b/src/evaluate.h
index 60fea5f6d..25b94ca64 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -37,12 +37,16 @@ namespace Eval {

 namespace NNUE {
 class Network;
+struct AccumulatorCaches;
 }

 std::string trace(Position& pos, const Eval::NNUE::Network& network);

 int simple_eval(const Position& pos, Color c);
-Value evaluate(const NNUE::Network& network, const Position& pos, int optimism);
+Value evaluate(const NNUE::Network&           network,
+               const Position&                pos,
+               Eval::NNUE::AccumulatorCaches& caches,
+               int                            optimism);

 }  // namespace Eval
diff --git a/src/nnue/features/half_ka_v2_hm.cpp b/src/nnue/features/half_ka_v2_hm.cpp
index f01bc6eff..0a16df1b3 100644
--- a/src/nnue/features/half_ka_v2_hm.cpp
+++ b/src/nnue/features/half_ka_v2_hm.cpp
@@ -23,7 +23,7 @@
 #include "../../bitboard.h"
 #include "../../position.h"
 #include "../../types.h"
-#include "../nnue_common.h"
+#include "../nnue_accumulator.h"

 namespace Stockfish::Eval::NNUE::Features {

@@ -36,22 +36,9 @@ inline IndexType HalfKAv2_hm::make_index(Square s, Piece pc, Square ksq, int ab)
                      + PS_NB * ((KingBuckets[ksq] & 0x7) * 9 + ab));
 }

-// Get a list of indices for active features
-template<Color Perspective>
-void HalfKAv2_hm::append_active_indices(const Position& pos, IndexList& active) {
-    Square   ksq = pos.square<KING>(Perspective);
-    int      ab  = pos.count<ADVISOR>(Perspective) * 3 + pos.count<BISHOP>(Perspective);
-    Bitboard bb  = pos.pieces();
-    while (bb)
-    {
-        Square s = pop_lsb(bb);
-        active.push_back(make_index<Perspective>(s, pos.piece_on(s), ksq, ab));
-    }
-}
-
 // Explicit template instantiations
-template void HalfKAv2_hm::append_active_indices<WHITE>(const Position& pos, IndexList& active);
-template void HalfKAv2_hm::append_active_indices<BLACK>(const Position& pos, IndexList& active);
+template IndexType HalfKAv2_hm::make_index<WHITE>(Square s, Piece pc, Square ksq, int ab);
+template IndexType HalfKAv2_hm::make_index<BLACK>(Square s, Piece pc, Square ksq, int ab);

 // Get a list of indices for recently changed features
 template<Color Perspective>
diff --git a/src/nnue/features/half_ka_v2_hm.h b/src/nnue/features/half_ka_v2_hm.h
index 23f4e505c..5259ece36 100644
--- a/src/nnue/features/half_ka_v2_hm.h
+++ b/src/nnue/features/half_ka_v2_hm.h
@@ -64,10 +64,6 @@ class HalfKAv2_hm {
     };
     // clang-format on

-    // Index of a feature for a given king position and another piece on some square
-    template<Color Perspective>
-    static IndexType make_index(Square s, Piece pc, Square ksq, int ab);
-
    public:
     // Feature name
     static constexpr const char* Name = "HalfKAv2_hm";
@@ -95,6 +91,19 @@ class HalfKAv2_hm {
     };
 #undef M

+    static constexpr uint8_t KingCacheMaps[SQUARE_NB] = {
+      0, 0, 0, 0, 1, 2, 0, 0, 0,
+      0, 0, 0, 5, 4, 3, 0, 0, 0,
+      0, 0, 0, 6, 7, 8, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 6, 7, 8, 0, 0, 0,
+      0, 0, 0, 5, 4, 3, 0, 0, 0,
+      0, 0, 0, 0, 1, 2, 0, 0, 0,
+    };
+
     // Map advisor and bishop location into White King plane
     static constexpr uint8_t ABMap[SQUARE_NB] = {
       0, 0, 0, 1, 0, 2, 5, 0, 0,
@@ -133,9 +142,9 @@ class HalfKAv2_hm {
     static constexpr IndexType MaxActiveDimensions = 32;
     using IndexList = ValueList<IndexType, MaxActiveDimensions>;

-    // Get a list of indices for active features
+    // Index of a feature for a given king position and another piece on some square
     template<Color Perspective>
-    static void append_active_indices(const Position& pos, IndexList& active);
+    static IndexType make_index(Square s, Piece pc, Square ksq, int ab);

     // Get a list of indices for recently changed features
     template<Color Perspective>
diff --git a/src/nnue/network.cpp b/src/nnue/network.cpp
index 5ec42b05f..93dbeaae3 100644
--- a/src/nnue/network.cpp
+++ b/src/nnue/network.cpp
@@ -126,7 +126,10 @@ bool Network::save(const std::optional<std::string>& filename) const {
 }


-Value Network::evaluate(const Position& pos, bool adjusted, int* complexity) const {
+Value Network::evaluate(const Position&           pos,
+                        AccumulatorCaches::Cache* cache,
+                        bool                      adjusted,
+                        int*                      complexity) const {

     // We manually align the arrays on the stack because with gcc < 9.3
     // overaligning stack variables with alignas() doesn't work correctly.
@@ -145,7 +148,7 @@ Value Network::evaluate(const Position& pos, bool adjusted, int* complexity) con
     ASSERT_ALIGNED(transformedFeatures, alignment);

     const int  bucket     = (pos.count<ALL_PIECES>() - 1) / 4;
-    const auto psqt       = featureTransformer->transform(pos, transformedFeatures, bucket);
+    const auto psqt       = featureTransformer->transform(pos, cache, transformedFeatures, bucket);
     const auto positional = network[bucket]->propagate(transformedFeatures);

     if (complexity)
@@ -188,12 +191,12 @@ void Network::verify(std::string evalfilePath) const {
 }


-void Network::hint_common_access(const Position& pos) const {
-    featureTransformer->hint_common_access(pos);
+void Network::hint_common_access(const Position& pos, AccumulatorCaches::Cache* cache) const {
+    featureTransformer->hint_common_access(pos, cache);
 }

-NnueEvalTrace Network::trace_evaluate(const Position& pos) const {
+NnueEvalTrace Network::trace_evaluate(const Position& pos, AccumulatorCaches::Cache* cache) const {

     // We manually align the arrays on the stack because with gcc < 9.3
     // overaligning stack variables with alignas() doesn't work correctly.
     constexpr uint64_t alignment = CacheLineSize;
@@ -214,8 +217,9 @@ NnueEvalTrace Network::trace_evaluate(const Position& pos) const {
     t.correctBucket = (pos.count<ALL_PIECES>() - 1) / 4;
     for (IndexType bucket = 0; bucket < LayerStacks; ++bucket)
     {
-        const auto materialist = featureTransformer->transform(pos, transformedFeatures, bucket);
-        const auto positional  = network[bucket]->propagate(transformedFeatures);
+        const auto materialist =
+          featureTransformer->transform(pos, cache, transformedFeatures, bucket);
+        const auto positional = network[bucket]->propagate(transformedFeatures);

         t.psqt[bucket]       = static_cast<Value>(materialist / OutputScale);
         t.positional[bucket] = static_cast<Value>(positional / OutputScale);
diff --git a/src/nnue/network.h b/src/nnue/network.h
index 13ecab11c..c489785f1 100644
--- a/src/nnue/network.h
+++ b/src/nnue/network.h
@@ -26,6 +26,7 @@
 #include "nnue_architecture.h"
 #include "nnue_feature_transformer.h"
 #include "nnue_misc.h"
+#include "nnue_accumulator.h"

 namespace Stockfish {

@@ -42,13 +43,16 @@ class Network {

     bool save(const std::optional<std::string>& filename) const;

-    Value evaluate(const Position& pos, bool adjusted = false, int* complexity = nullptr) const;
+    Value evaluate(const Position&           pos,
+                   AccumulatorCaches::Cache* cache,
+                   bool                      adjusted   = false,
+                   int*                      complexity = nullptr) const;

-    void hint_common_access(const Position& pos) const;
+    void hint_common_access(const Position& pos, AccumulatorCaches::Cache* cache) const;

     void verify(std::string evalfilePath) const;

-    NnueEvalTrace trace_evaluate(const Position& pos) const;
+    NnueEvalTrace trace_evaluate(const Position& pos, AccumulatorCaches::Cache* cache) const;

    private:
     void load_user_net(const std::string&, const std::string&);
@@ -75,6 +79,8 @@ class Network {
     // Hash value of evaluation function structure
     static constexpr std::uint32_t hash =
       FeatureTransformer::get_hash_value() ^ NetworkArchitecture::get_hash_value();
+
+    friend struct AccumulatorCaches::Cache;
 };

 }  // namespace Stockfish::Eval::NNUE
diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h
index f6d705243..f3ab1a0a8 100644
--- a/src/nnue/nnue_accumulator.h
+++ b/src/nnue/nnue_accumulator.h
@@ -28,11 +28,67 @@

 namespace Stockfish::Eval::NNUE {

+using BiasType       = std::int16_t;
+using PSQTWeightType = std::int32_t;
+using IndexType      = std::uint32_t;
+
 // Class that holds the result of affine transformation of input features
 struct alignas(CacheLineSize) Accumulator {
-    std::int16_t accumulation[2][TransformedFeatureDimensions];
-    std::int32_t psqtAccumulation[2][PSQTBuckets];
-    bool         computed[2];
+    std::int16_t accumulation[COLOR_NB][TransformedFeatureDimensions];
+    std::int32_t psqtAccumulation[COLOR_NB][PSQTBuckets];
+    bool         computed[COLOR_NB];
 };
+
+
+// AccumulatorCaches struct provides per-thread accumulator caches, where each
+// cache contains multiple entries for each of the possible king squares.
+// When the accumulator needs to be refreshed, the cached entry is used to more
+// efficiently update the accumulator, instead of rebuilding it from scratch.
+// This idea was first described by Luecx (author of Koivisto) and
+// is commonly referred to as "Finny Tables".
+struct AccumulatorCaches {
+
+    struct alignas(CacheLineSize) Cache {
+
+        struct alignas(CacheLineSize) Entry {
+            BiasType       accumulation[COLOR_NB][TransformedFeatureDimensions];
+            PSQTWeightType psqtAccumulation[COLOR_NB][PSQTBuckets];
+            Bitboard       byColorBB[COLOR_NB][COLOR_NB];
+            Bitboard       byTypeBB[COLOR_NB][PIECE_TYPE_NB];
+
+            // To initialize a refresh entry, we set all its bitboards empty,
+            // so we put the biases in the accumulation, without any weights on top
+            void clear(const BiasType* biases) {
+
+                std::memset(byColorBB, 0, sizeof(byColorBB));
+                std::memset(byTypeBB, 0, sizeof(byTypeBB));
+
+                std::memcpy(accumulation[WHITE], biases,
+                            TransformedFeatureDimensions * sizeof(BiasType));
+                std::memcpy(accumulation[BLACK], biases,
+                            TransformedFeatureDimensions * sizeof(BiasType));
+
+                std::memset(psqtAccumulation, 0, sizeof(psqtAccumulation));
+            }
+        };
+
+        template<typename Network>
+        void clear(const Network& network) {
+            for (auto& entry : entries)
+                entry.clear(network.featureTransformer->biases);
+        }
+
+        Entry& operator[](int index) { return entries[index]; }
+
+        std::array<Entry, 9 * 9> entries;
+    };
+
+    template<typename Network>
+    void clear(const Network& network) {
+        cache.clear(network);
+    }
+
+    Cache cache;
+};

 }  // namespace Stockfish::Eval::NNUE
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index c7381137f..1331766fa 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -232,10 +232,10 @@ static constexpr int NumPsqtRegs =
 // Input feature converter
 class FeatureTransformer {

-   private:
     // Number of output dimensions for one side
     static constexpr IndexType HalfDimensions = TransformedFeatureDimensions;

+   private:
 #ifdef VECTOR
     static constexpr IndexType TileHeight     = NumRegs * sizeof(vec_t) / 2;
     static constexpr IndexType PsqtTileHeight = NumPsqtRegs * sizeof(psqt_vec_t) / 4;
@@ -334,9 +334,12 @@ class FeatureTransformer {
     }

     // Convert input features
-    std::int32_t transform(const Position& pos, OutputType* output, int bucket) const {
-        update_accumulator<WHITE>(pos);
-        update_accumulator<BLACK>(pos);
+    std::int32_t transform(const Position&           pos,
+                           AccumulatorCaches::Cache* cache,
+                           OutputType*               output,
+                           int                       bucket) const {
+        update_accumulator<WHITE>(pos, cache);
+        update_accumulator<BLACK>(pos, cache);

         const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
         const auto& accumulation    = pos.state()->accumulator.accumulation;
@@ -396,9 +399,9 @@ class FeatureTransformer {
         return psqt;
     }  // end of function transform()

-    void hint_common_access(const Position& pos) const {
-        hint_common_access_for_perspective<WHITE>(pos);
-        hint_common_access_for_perspective<BLACK>(pos);
+    void hint_common_access(const Position& pos, AccumulatorCaches::Cache* cache) const {
+        hint_common_access_for_perspective<WHITE>(pos, cache);
+        hint_common_access_for_perspective<BLACK>(pos, cache);
     }

    private:
@@ -662,116 +665,148 @@ class FeatureTransformer {
     }

     template<Color Perspective>
-    void update_accumulator_refresh(const Position& pos) const {
-#ifdef VECTOR
-        // Gcc-10.2 unnecessarily spills AVX2 registers if this array
-        // is defined in the VECTOR code below, once in each branch
-        vec_t      acc[NumRegs];
-        psqt_vec_t psqt[NumPsqtRegs];
-#endif
+    void update_accumulator_refresh(const Position& pos, AccumulatorCaches::Cache* cache) const {
+        assert(cache != nullptr);
+
+        const Square ksq = pos.square<KING>(Perspective);
+        const int    ab  = pos.count<ADVISOR>(Perspective) * 3 + pos.count<BISHOP>(Perspective);
+
+        auto& entry = (*cache)[FeatureSet::KingCacheMaps[ksq] * 9 + ab];

-        // Refresh the accumulator
-        // Could be extracted to a separate function because it's done in 2 places,
-        // but it's unclear if compilers would correctly handle register allocation.
         auto& accumulator                 = pos.state()->accumulator;
         accumulator.computed[Perspective] = true;
-        FeatureSet::IndexList active;
-        FeatureSet::append_active_indices<Perspective>(pos, active);
+
+        FeatureSet::IndexList removed, added;
+        for (Color c : {WHITE, BLACK})
+        {
+            for (PieceType pt = ROOK; pt <= KING; ++pt)
+            {
+                const Piece    piece = make_piece(c, pt);
+                const Bitboard oldBB =
+                  entry.byColorBB[Perspective][c] & entry.byTypeBB[Perspective][pt];
+                const Bitboard newBB    = pos.pieces(c, pt);
+                Bitboard       toRemove = oldBB & ~newBB;
+                Bitboard       toAdd    = newBB & ~oldBB;
+
+                while (toRemove)
+                {
+                    Square sq = pop_lsb(toRemove);
+                    removed.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq, ab));
+                }
+                while (toAdd)
+                {
+                    Square sq = pop_lsb(toAdd);
+                    added.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq, ab));
+                }
+            }
+        }

 #ifdef VECTOR
+        vec_t      acc[NumRegs];
+        psqt_vec_t psqt[NumPsqtRegs];
+
         for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j)
         {
-            auto biasesTile = reinterpret_cast<const vec_t*>(&biases[j * TileHeight]);
+            auto entryTile =
+              reinterpret_cast<vec_t*>(&entry.accumulation[Perspective][j * TileHeight]);
             for (IndexType k = 0; k < NumRegs; ++k)
-                acc[k] = biasesTile[k];
+                acc[k] = entryTile[k];

-            int i = 0;
-            for (; i < int(active.size()) - 1; i += 2)
+            for (int i = 0; i < int(added.size()); ++i)
             {
-                IndexType       index0  = active[i];
-                IndexType       index1  = active[i + 1];
-                const IndexType offset0 = HalfDimensions * index0 + j * TileHeight;
-                const IndexType offset1 = HalfDimensions * index1 + j * TileHeight;
-                auto            column0 = reinterpret_cast<const vec_t*>(&weights[offset0]);
-                auto            column1 = reinterpret_cast<const vec_t*>(&weights[offset1]);
+                IndexType       index  = added[i];
+                const IndexType offset = HalfDimensions * index + j * TileHeight;
+                auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);

                 for (unsigned k = 0; k < NumRegs; ++k)
-                    acc[k] = vec_add_16(acc[k], vec_add_16(column0[k], column1[k]));
+                    acc[k] = vec_add_16(acc[k], column[k]);
             }
-            for (; i < int(active.size()); ++i)
+            for (int i = 0; i < int(removed.size()); ++i)
             {
-                IndexType       index  = active[i];
+                IndexType       index  = removed[i];
                 const IndexType offset = HalfDimensions * index + j * TileHeight;
                 auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);

                 for (unsigned k = 0; k < NumRegs; ++k)
-                    acc[k] = vec_add_16(acc[k], column[k]);
+                    acc[k] = vec_sub_16(acc[k], column[k]);
             }

-            auto accTile =
-              reinterpret_cast<vec_t*>(&accumulator.accumulation[Perspective][j * TileHeight]);
-            for (unsigned k = 0; k < NumRegs; k++)
-                vec_store(&accTile[k], acc[k]);
+            for (IndexType k = 0; k < NumRegs; k++)
+                vec_store(&entryTile[k], acc[k]);
         }

         for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j)
         {
+            auto entryTilePsqt = reinterpret_cast<psqt_vec_t*>(
+              &entry.psqtAccumulation[Perspective][j * PsqtTileHeight]);
             for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                psqt[k] = vec_zero_psqt();
+                psqt[k] = entryTilePsqt[k];

-            int i = 0;
-            for (; i < int(active.size()) - 1; i += 2)
+            for (int i = 0; i < int(added.size()); ++i)
             {
-                IndexType       index0      = active[i];
-                IndexType       index1      = active[i + 1];
-                const IndexType offset0     = PSQTBuckets * index0 + j * PsqtTileHeight;
-                const IndexType offset1     = PSQTBuckets * index1 + j * PsqtTileHeight;
-                auto            columnPsqt0 = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset0]);
-                auto            columnPsqt1 = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset1]);
+                IndexType       index      = added[i];
+                const IndexType offset     = PSQTBuckets * index + j * PsqtTileHeight;
+                auto            columnPsqt = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);

                 for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                    psqt[k] =
-                      vec_add_psqt_32(psqt[k], vec_add_psqt_32(columnPsqt0[k], columnPsqt1[k]));
+                    psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]);
             }
-            for (; i < int(active.size()); ++i)
+            for (int i = 0; i < int(removed.size()); ++i)
             {
-                IndexType       index      = active[i];
+                IndexType       index      = removed[i];
                 const IndexType offset     = PSQTBuckets * index + j * PsqtTileHeight;
                 auto            columnPsqt = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);

                 for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                    psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]);
+                    psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]);
             }

-            auto accTilePsqt = reinterpret_cast<psqt_vec_t*>(
-              &accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]);
             for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                vec_store_psqt(&accTilePsqt[k], psqt[k]);
+                vec_store_psqt(&entryTilePsqt[k], psqt[k]);
         }

 #else
-        std::memcpy(accumulator.accumulation[Perspective], biases,
-                    HalfDimensions * sizeof(BiasType));
-        for (std::size_t k = 0; k < PSQTBuckets; ++k)
-            accumulator.psqtAccumulation[Perspective][k] = 0;
-
-        for (const auto index : active)
+        for (const auto index : added)
         {
             const IndexType offset = HalfDimensions * index;
+            for (IndexType j = 0; j < HalfDimensions; ++j)
+                entry.accumulation[Perspective][j] += weights[offset + j];
+
+            for (std::size_t k = 0; k < PSQTBuckets; ++k)
+                entry.psqtAccumulation[Perspective][k] += psqtWeights[index * PSQTBuckets + k];
+        }
+        for (const auto index : removed)
+        {
+            const IndexType offset = HalfDimensions * index;
             for (IndexType j = 0; j < HalfDimensions; ++j)
-                accumulator.accumulation[Perspective][j] += weights[offset + j];
+                entry.accumulation[Perspective][j] -= weights[offset + j];

             for (std::size_t k = 0; k < PSQTBuckets; ++k)
-                accumulator.psqtAccumulation[Perspective][k] +=
-                    psqtWeights[index * PSQTBuckets + k];
+                entry.psqtAccumulation[Perspective][k] -= psqtWeights[index * PSQTBuckets + k];
         }
+
 #endif
+
+        // The accumulator of the refresh entry has been updated.
+        // Now copy its content to the actual accumulator we were refreshing
+
+        std::memcpy(accumulator.psqtAccumulation[Perspective], entry.psqtAccumulation[Perspective],
+                    sizeof(int32_t) * PSQTBuckets);
+
+        std::memcpy(accumulator.accumulation[Perspective], entry.accumulation[Perspective],
+                    sizeof(BiasType) * HalfDimensions);
+
+        for (Color c : {WHITE, BLACK})
+            entry.byColorBB[Perspective][c] = pos.pieces(c);
+
+        for (PieceType pt = ROOK; pt <= KING; ++pt)
+            entry.byTypeBB[Perspective][pt] = pos.pieces(pt);
     }

     template<Color Perspective>
-    void hint_common_access_for_perspective(const Position& pos) const {
+    void hint_common_access_for_perspective(const Position&           pos,
+                                            AccumulatorCaches::Cache* cache) const {

         // Works like update_accumulator, but performs less work.
         // Updates ONLY the accumulator for pos.
@@ -791,11 +826,11 @@ class FeatureTransformer {
             update_accumulator_incremental(pos, oldest_st, states_to_update);
         }
         else
-            update_accumulator_refresh<Perspective>(pos);
+            update_accumulator_refresh<Perspective>(pos, cache);
     }

     template<Color Perspective>
-    void update_accumulator(const Position& pos) const {
+    void update_accumulator(const Position& pos, AccumulatorCaches::Cache* cache) const {

         auto [oldest_st, next] = try_find_computed_accumulator(pos);

@@ -816,10 +851,12 @@ class FeatureTransformer {
         }
         else
         {
-            update_accumulator_refresh<Perspective>(pos);
+            update_accumulator_refresh<Perspective>(pos, cache);
         }
     }

+    friend struct AccumulatorCaches::Cache;
+
     alignas(CacheLineSize) BiasType       biases[HalfDimensions];
     alignas(CacheLineSize) WeightType     weights[HalfDimensions * InputDimensions];
     alignas(CacheLineSize) PSQTWeightType psqtWeights[InputDimensions * PSQTBuckets];
diff --git a/src/nnue/nnue_misc.cpp b/src/nnue/nnue_misc.cpp
index 8da47e9c3..762fe49e5 100644
--- a/src/nnue/nnue_misc.cpp
+++ b/src/nnue/nnue_misc.cpp
@@ -38,9 +38,11 @@ namespace Stockfish::Eval::NNUE {

 constexpr std::string_view PieceToChar(" RACPNBK racpnbk");

-void hint_common_parent_position(const Position& pos, const Network& network) {
+void hint_common_parent_position(const Position&    pos,
+                                 const Network&     network,
+                                 AccumulatorCaches& caches) {

-    network.hint_common_access(pos);
+    network.hint_common_access(pos, &caches.cache);
 }

 namespace {
@@ -96,7 +98,7 @@ void format_cp_aligned_dot(Value v, std::stringstream& stream, const Position& p

 // Returns a string with the value of each piece on a board,
 // and a table for (PSQT, Layers) values bucket by bucket.
-std::string trace(Position& pos, const Eval::NNUE::Network& network) {
+std::string trace(Position& pos, const Eval::NNUE::Network& network, AccumulatorCaches& caches) {

     std::stringstream ss;

@@ -122,7 +124,7 @@ std::string trace(Position& pos, const Eval::NNUE::Network& network) {

     // We estimate the value of each piece by doing a differential evaluation from
     // the current base eval, simulating the removal of the piece from its square.
-    Value base = network.evaluate(pos);
+    Value base = network.evaluate(pos, &caches.cache);
     base       = pos.side_to_move() == WHITE ? base : -base;

     for (File f = FILE_A; f <= FILE_I; ++f)
@@ -140,7 +142,7 @@ std::string trace(Position& pos, const Eval::NNUE::Network& network) {
             st->accumulator.computed[WHITE] = false;
             st->accumulator.computed[BLACK] = false;

-            Value eval = network.evaluate(pos);
+            Value eval = network.evaluate(pos, &caches.cache);
             eval       = pos.side_to_move() == WHITE ? eval : -eval;
             v          = base - eval;

@@ -157,7 +159,7 @@ std::string trace(Position& pos, const Eval::NNUE::Network& network) {
         ss << board[row] << '\n';
     ss << '\n';

-    auto t = network.trace_evaluate(pos);
+    auto t = network.trace_evaluate(pos, &caches.cache);

     ss << " NNUE network contributions "
        << (pos.side_to_move() == WHITE ? "(White to move)" : "(Black to move)") << std::endl
@@ -171,11 +173,14 @@ std::string trace(Position& pos, const Eval::NNUE::Network& network) {
         ss << "| " << bucket << " ";
         ss << " | ";
         format_cp_aligned_dot(t.psqt[bucket], ss, pos);
-        ss << " " << " | ";
+        ss << " "
+           << " | ";
         format_cp_aligned_dot(t.positional[bucket], ss, pos);
-        ss << " " << " | ";
+        ss << " "
+           << " | ";
         format_cp_aligned_dot(t.psqt[bucket] + t.positional[bucket], ss, pos);
-        ss << " " << " |";
+        ss << " "
+           << " |";
         if (bucket == t.correctBucket)
             ss << " <-- this bucket is used";
         ss << '\n';
diff --git a/src/nnue/nnue_misc.h b/src/nnue/nnue_misc.h
index ce9960a36..e48516f10 100644
--- a/src/nnue/nnue_misc.h
+++ b/src/nnue/nnue_misc.h
@@ -47,12 +47,13 @@ struct NnueEvalTrace {
     std::size_t correctBucket;
 };

-
 class Network;
+struct AccumulatorCaches;

-
-std::string trace(Position& pos, const Network& network);
-void hint_common_parent_position(const Position& pos, const Network& network);
+std::string trace(Position& pos, const Network& network, AccumulatorCaches& caches);
+void hint_common_parent_position(const Position&    pos,
+                                 const Network&     network,
+                                 AccumulatorCaches& caches);

 }  // namespace Stockfish::Eval::NNUE
 }  // namespace Stockfish
diff --git a/src/search.cpp b/src/search.cpp
index 6fb9a9632..1a28e5d40 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -33,6 +33,8 @@
 #include "misc.h"
 #include "movegen.h"
 #include "movepick.h"
+#include "nnue/network.h"
+#include "nnue/nnue_accumulator.h"
 #include "nnue/nnue_common.h"
 #include "nnue/nnue_misc.h"
 #include "position.h"
@@ -108,6 +110,7 @@ Search::Worker::Worker(SharedState& sharedState,
     // Unpack the SharedState struct into member variables
     thread_idx(thread_id),
     manager(std::move(sm)),
+    refreshTable(),
     options(sharedState.options),
     threads(sharedState.threads),
     tt(sharedState.tt),
@@ -116,6 +119,10 @@
 }

 void Search::Worker::start_searching() {
+
+    // Initialize accumulator refresh entries
+    refreshTable.clear(network);
+
     // Non-main threads go directly to iterative_deepening()
     if (!is_mainthread())
     {
@@ -513,7 +520,7 @@ Value Search::Worker::search(

     if (threads.stop.load(std::memory_order_relaxed) || ss->ply >= MAX_PLY)
         return (ss->ply >= MAX_PLY && !ss->inCheck)
-                 ? evaluate(network, pos, thisThread->optimism[us])
+                 ? evaluate(network, pos, refreshTable, thisThread->optimism[us])
                  : value_draw(thisThread->nodes);

     // Step 3. Mate distance pruning. Even if we mate at the next move our score
@@ -594,7 +601,7 @@ Value Search::Worker::search(
     {
         // Providing the hint that this node's accumulator will be used often
        // brings significant Elo gain (~13 Elo).
-        Eval::NNUE::hint_common_parent_position(pos, network);
+        Eval::NNUE::hint_common_parent_position(pos, network, refreshTable);
         unadjustedStaticEval = eval = ss->staticEval;
     }
     else if (ss->ttHit)
@@ -602,9 +609,9 @@ Value Search::Worker::search(
     {
         // Never assume anything about values stored in TT
         unadjustedStaticEval = tte->eval();
         if (unadjustedStaticEval == VALUE_NONE)
-            unadjustedStaticEval = evaluate(network, pos, thisThread->optimism[us]);
+            unadjustedStaticEval = evaluate(network, pos, refreshTable, thisThread->optimism[us]);
         else if (PvNode)
-            Eval::NNUE::hint_common_parent_position(pos, network);
+            Eval::NNUE::hint_common_parent_position(pos, network, refreshTable);

         ss->staticEval = eval = to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos);
@@ -614,7 +621,7 @@ Value Search::Worker::search(
     }
     else
     {
-        unadjustedStaticEval = evaluate(network, pos, thisThread->optimism[us]);
+        unadjustedStaticEval = evaluate(network, pos, refreshTable, thisThread->optimism[us]);
         ss->staticEval = eval = to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos);

         // Static evaluation is saved as it was before adjustment by correction history
@@ -771,7 +778,7 @@ Value Search::Worker::search(
             }
         }

-        Eval::NNUE::hint_common_parent_position(pos, network);
+        Eval::NNUE::hint_common_parent_position(pos, network, refreshTable);
     }

 moves_loop:  // When in check, search starts here
@@ -1320,7 +1327,8 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta,
     }

     if (ss->ply >= MAX_PLY)
-        return !ss->inCheck ? evaluate(network, pos, thisThread->optimism[us]) : VALUE_DRAW;
+        return !ss->inCheck ? evaluate(network, pos, refreshTable, thisThread->optimism[us])
+                            : VALUE_DRAW;

     assert(0 <= ss->ply && ss->ply < MAX_PLY);

@@ -1351,7 +1359,8 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta,
             // Never assume anything about values stored in TT
             unadjustedStaticEval = tte->eval();
             if (unadjustedStaticEval == VALUE_NONE)
-                unadjustedStaticEval = evaluate(network, pos, thisThread->optimism[us]);
+                unadjustedStaticEval =
+                  evaluate(network, pos, refreshTable, thisThread->optimism[us]);

             ss->staticEval = bestValue =
               to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos);
@@ -1364,7 +1373,7 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta,
         {
             // In case of null move search, use previous static eval with a different sign
             unadjustedStaticEval = (ss - 1)->currentMove != Move::null()
-                                   ? evaluate(network, pos, thisThread->optimism[us])
+                                   ? evaluate(network, pos, refreshTable, thisThread->optimism[us])
                                    : -(ss - 1)->staticEval;
             ss->staticEval = bestValue =
               to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos);
diff --git a/src/search.h b/src/search.h
index 28edf6b4e..e09eae7b5 100644
--- a/src/search.h
+++ b/src/search.h
@@ -26,9 +26,9 @@
 #include
 #include
 #include
+#include
 #include
 #include
-#include

 #include "misc.h"
 #include "movepick.h"
@@ -36,6 +36,7 @@
 #include "score.h"
 #include "timeman.h"
 #include "types.h"
+#include "nnue/nnue_accumulator.h"

 namespace Stockfish {

@@ -295,6 +296,10 @@ class Worker {
     // The main thread has a SearchManager, the others have a NullSearchManager
     std::unique_ptr<ISearchManager> manager;

+    // Used by NNUE
+
+    Eval::NNUE::AccumulatorCaches refreshTable;
+
     const OptionsMap&   options;
     ThreadPool&         threads;
     TranspositionTable& tt;
diff --git a/src/uci.cpp b/src/uci.cpp
index f13fadad7..4556bf087 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -275,9 +275,9 @@ void UCIEngine::bench(std::istream& args) {

     dbg_print();

-    std::cerr << "\n===========================" << "\nTotal time (ms) : " << elapsed
-              << "\nNodes searched : " << nodes << "\nNodes/second : " << 1000 * nodes / elapsed
-              << std::endl;
+    std::cerr << "\n==========================="
+              << "\nTotal time (ms) : " << elapsed << "\nNodes searched : " << nodes
+              << "\nNodes/second : " << 1000 * nodes / elapsed << std::endl;

     // reset callback, to not capture a dangling reference to nodesSearched
     engine.set_on_update_full([&](const auto& i) { on_update_full(i, options["UCI_ShowWDL"]); });
@@ -290,7 +290,7 @@ void UCIEngine::setoption(std::istringstream& is) {
 }

 std::uint64_t UCIEngine::perft(const Search::LimitsType& limits) {
-    auto nodes = engine.perft(engine.fen(), limits.perft, engine.get_options()["UCI_Chess960"]);
+    auto nodes = engine.perft(engine.fen(), limits.perft);
     sync_cout << "\nNodes searched: " << nodes << "\n" << sync_endl;
     return nodes;
 }
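Illustrative sketch (not taken from the patch): a simplified, scalar model of the "Finny table" refresh that update_accumulator_refresh() performs above. Each cache entry remembers the board it last saw, the current position is diffed against that memory to obtain added and removed features, and only those weight columns are applied to the cached accumulator before it is copied out. The names NUM_FEATURES, DIMS, Entry::occupancy and refresh() are hypothetical stand-ins for the real FeatureSet::make_index() indices, TransformedFeatureDimensions and the byColorBB/byTypeBB bitboards.

    #include <array>
    #include <cstdint>

    constexpr int NUM_FEATURES = 64;  // hypothetical feature count (one per occupancy bit)
    constexpr int DIMS         = 8;   // hypothetical accumulator width

    struct Entry {                                       // plays the role of AccumulatorCaches::Cache::Entry
        std::array<std::int16_t, DIMS> accumulation{};   // starts out holding only the biases
        std::uint64_t                  occupancy = 0;    // stands in for byColorBB/byTypeBB
    };

    // weights[f] is the column added to the accumulator while feature f is active
    std::array<std::array<std::int16_t, DIMS>, NUM_FEATURES> weights{};

    // Diff the board the entry last saw against the current one and apply only the
    // changes, instead of rebuilding the accumulator from scratch.
    void refresh(Entry& entry, std::uint64_t current) {
        const std::uint64_t added   = current & ~entry.occupancy;
        const std::uint64_t removed = entry.occupancy & ~current;

        for (int f = 0; f < NUM_FEATURES; ++f)
        {
            if (added & (1ULL << f))
                for (int d = 0; d < DIMS; ++d)
                    entry.accumulation[d] += weights[f][d];
            if (removed & (1ULL << f))
                for (int d = 0; d < DIMS; ++d)
                    entry.accumulation[d] -= weights[f][d];
        }
        entry.occupancy = current;  // the entry now describes the current board
    }

    int main() {
        Entry entry;           // a freshly cleared entry: biases only, empty board
        refresh(entry, 0x0F);  // first use: features 0..3 are added
        refresh(entry, 0x0E);  // later refresh: only feature 0 has to be removed
    }

As in the patch, the cost of a refresh is proportional to how much the board changed since the entry was last used, rather than to the number of pieces on the board, which is what makes the per-king-bucket cache worthwhile.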