diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 1363aa14a..c579a6487 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -25,12 +25,14 @@
 #include
 #include
 #include
+#include <memory>

 #include "nnue/network.h"
 #include "nnue/nnue_misc.h"
 #include "position.h"
 #include "types.h"
 #include "uci.h"
+#include "nnue/nnue_accumulator.h"

 namespace Stockfish {

@@ -46,7 +48,10 @@ int Eval::simple_eval(const Position& pos, Color c) {

 // Evaluate is the evaluator for the outer world. It returns a static evaluation
 // of the position from the point of view of the side to move.
-Value Eval::evaluate(const Eval::NNUE::Network& network, const Position& pos, int optimism) {
+Value Eval::evaluate(const Eval::NNUE::Network& network,
+                     const Position&            pos,
+                     NNUE::AccumulatorCaches&   caches,
+                     int                        optimism) {

     assert(!pos.checkers());

@@ -56,7 +61,7 @@ Value Eval::evaluate(const Eval::NNUE::Network& network, const Position& pos, in
     int simpleEval = simple_eval(pos, stm);
     int nnueComplexity;

-    Value nnue = network.evaluate(pos, true, &nnueComplexity);
+    Value nnue = network.evaluate(pos, &caches.cache, true, &nnueComplexity);

     // Blend optimism and eval with nnue complexity and material imbalance
     optimism += optimism * (nnueComplexity + std::abs(simpleEval - nnue)) / 729;
@@ -80,21 +85,24 @@ Value Eval::evaluate(const Eval::NNUE::Network& network, const Position& pos, in
 // Trace scores are from white's point of view
 std::string Eval::trace(Position& pos, const Eval::NNUE::Network& network) {

+    auto caches = std::make_unique<Eval::NNUE::AccumulatorCaches>();
+    caches->clear(network);
+
     if (pos.checkers())
         return "Final evaluation: none (in check)";

     std::stringstream ss;
     ss << std::showpoint << std::noshowpos << std::fixed << std::setprecision(2);
-    ss << '\n' << NNUE::trace(pos, network) << '\n';
+    ss << '\n' << NNUE::trace(pos, network, *caches) << '\n';

     ss << std::showpoint << std::showpos << std::fixed << std::setprecision(2) << std::setw(15);

-    Value v = network.evaluate(pos);
+    Value v = network.evaluate(pos, &caches->cache);
     v = pos.side_to_move() == WHITE ? v : -v;
     ss << "NNUE evaluation " << 0.01 * UCIEngine::to_cp(v, pos) << " (white side)\n";

-    v = evaluate(network, pos, VALUE_ZERO);
+    v = evaluate(network, pos, *caches, VALUE_ZERO);
     v = pos.side_to_move() == WHITE ? v : -v;
     ss << "Final evaluation " << 0.01 * UCIEngine::to_cp(v, pos) << " (white side)";
     ss << " [with scaled NNUE, ...]";
diff --git a/src/evaluate.h b/src/evaluate.h
index 60fea5f6d..25b94ca64 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -37,12 +37,16 @@ namespace Eval {

 namespace NNUE {
 class Network;
+struct AccumulatorCaches;
 }

 std::string trace(Position& pos, const Eval::NNUE::Network& network);

 int simple_eval(const Position& pos, Color c);
-Value evaluate(const NNUE::Network& network, const Position& pos, int optimism);
+Value evaluate(const NNUE::Network&           network,
+               const Position&                pos,
+               Eval::NNUE::AccumulatorCaches& caches,
+               int                            optimism);

 }  // namespace Eval
diff --git a/src/nnue/features/half_ka_v2_hm.cpp b/src/nnue/features/half_ka_v2_hm.cpp
index f01bc6eff..0a16df1b3 100644
--- a/src/nnue/features/half_ka_v2_hm.cpp
+++ b/src/nnue/features/half_ka_v2_hm.cpp
@@ -23,7 +23,7 @@
 #include "../../bitboard.h"
 #include "../../position.h"
 #include "../../types.h"
-#include "../nnue_common.h"
+#include "../nnue_accumulator.h"

 namespace Stockfish::Eval::NNUE::Features {

@@ -36,22 +36,9 @@ inline IndexType HalfKAv2_hm::make_index(Square s, Piece pc, Square ksq, int ab)
                      + PS_NB * ((KingBuckets[ksq] & 0x7) * 9 + ab));
 }

-// Get a list of indices for active features
-template<Color Perspective>
-void HalfKAv2_hm::append_active_indices(const Position& pos, IndexList& active) {
-    Square   ksq = pos.square<KING>(Perspective);
-    int      ab  = pos.count<ADVISOR>(Perspective) * 3 + pos.count<BISHOP>(Perspective);
-    Bitboard bb  = pos.pieces();
-    while (bb)
-    {
-        Square s = pop_lsb(bb);
-        active.push_back(make_index<Perspective>(s, pos.piece_on(s), ksq, ab));
-    }
-}
-
 // Explicit template instantiations
-template void HalfKAv2_hm::append_active_indices<WHITE>(const Position& pos, IndexList& active);
-template void HalfKAv2_hm::append_active_indices<BLACK>(const Position& pos, IndexList& active);
+template IndexType HalfKAv2_hm::make_index<WHITE>(Square s, Piece pc, Square ksq, int ab);
+template IndexType HalfKAv2_hm::make_index<BLACK>(Square s, Piece pc, Square ksq, int ab);

 // Get a list of indices for recently changed features
 template<Color Perspective>
diff --git a/src/nnue/features/half_ka_v2_hm.h b/src/nnue/features/half_ka_v2_hm.h
index 23f4e505c..5259ece36 100644
--- a/src/nnue/features/half_ka_v2_hm.h
+++ b/src/nnue/features/half_ka_v2_hm.h
@@ -64,10 +64,6 @@ class HalfKAv2_hm {
     };
     // clang-format on

-    // Index of a feature for a given king position and another piece on some square
-    template<Color Perspective>
-    static IndexType make_index(Square s, Piece pc, Square ksq, int ab);
-
    public:
     // Feature name
     static constexpr const char* Name = "HalfKAv2_hm";
@@ -95,6 +91,19 @@ class HalfKAv2_hm {
     };
 #undef M

+    static constexpr uint8_t KingCacheMaps[SQUARE_NB] = {
+      0, 0, 0, 0, 1, 2, 0, 0, 0,
+      0, 0, 0, 5, 4, 3, 0, 0, 0,
+      0, 0, 0, 6, 7, 8, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 6, 7, 8, 0, 0, 0,
+      0, 0, 0, 5, 4, 3, 0, 0, 0,
+      0, 0, 0, 0, 1, 2, 0, 0, 0,
+    };
+
     // Map advisor and bishop location into White King plane
     static constexpr uint8_t ABMap[SQUARE_NB] = {
       0, 0, 0, 1, 0, 2, 5, 0, 0,
@@ -133,9 +142,9 @@ class HalfKAv2_hm {
     static constexpr IndexType MaxActiveDimensions = 32;
     using IndexList = ValueList<IndexType, MaxActiveDimensions>;

-    // Get a list of indices for active features
+    // Index of a feature for a given king position and another piece on some square
     template<Color Perspective>
-    static void append_active_indices(const Position& pos, IndexList& active);
+    static IndexType make_index(Square s, Piece pc, Square ksq, int ab);

     // Get a list of indices for recently changed features
     template<Color Perspective>
diff --git a/src/nnue/network.cpp b/src/nnue/network.cpp
index 5ec42b05f..93dbeaae3 100644
--- a/src/nnue/network.cpp
+++ b/src/nnue/network.cpp
@@ -126,7 +126,10 @@ bool Network::save(const std::optional<std::string>& filename) const {
 }


-Value Network::evaluate(const Position& pos, bool adjusted, int* complexity) const {
+Value Network::evaluate(const Position&           pos,
+                        AccumulatorCaches::Cache* cache,
+                        bool                      adjusted,
+                        int*                      complexity) const {

     // We manually align the arrays on the stack because with gcc < 9.3
     // overaligning stack variables with alignas() doesn't work correctly.
@@ -145,7 +148,7 @@ Value Network::evaluate(const Position& pos, bool adjusted, int* complexity) con
     ASSERT_ALIGNED(transformedFeatures, alignment);

     const int  bucket     = (pos.count<ALL_PIECES>() - 1) / 4;
-    const auto psqt       = featureTransformer->transform(pos, transformedFeatures, bucket);
+    const auto psqt       = featureTransformer->transform(pos, cache, transformedFeatures, bucket);
     const auto positional = network[bucket]->propagate(transformedFeatures);

     if (complexity)
@@ -188,12 +191,12 @@ void Network::verify(std::string evalfilePath) const {
 }


-void Network::hint_common_access(const Position& pos) const {
-    featureTransformer->hint_common_access(pos);
+void Network::hint_common_access(const Position& pos, AccumulatorCaches::Cache* cache) const {
+    featureTransformer->hint_common_access(pos, cache);
 }

-NnueEvalTrace Network::trace_evaluate(const Position& pos) const {
+NnueEvalTrace Network::trace_evaluate(const Position& pos, AccumulatorCaches::Cache* cache) const {

     // We manually align the arrays on the stack because with gcc < 9.3
     // overaligning stack variables with alignas() doesn't work correctly.
     constexpr uint64_t alignment = CacheLineSize;
@@ -214,8 +217,9 @@ NnueEvalTrace Network::trace_evaluate(const Position& pos) const {
     t.correctBucket = (pos.count<ALL_PIECES>() - 1) / 4;
     for (IndexType bucket = 0; bucket < LayerStacks; ++bucket)
     {
-        const auto materialist = featureTransformer->transform(pos, transformedFeatures, bucket);
-        const auto positional  = network[bucket]->propagate(transformedFeatures);
+        const auto materialist =
+          featureTransformer->transform(pos, cache, transformedFeatures, bucket);
+        const auto positional = network[bucket]->propagate(transformedFeatures);

         t.psqt[bucket]       = static_cast<Value>(materialist / OutputScale);
         t.positional[bucket] = static_cast<Value>(positional / OutputScale);
diff --git a/src/nnue/network.h b/src/nnue/network.h
index 13ecab11c..c489785f1 100644
--- a/src/nnue/network.h
+++ b/src/nnue/network.h
@@ -26,6 +26,7 @@
 #include "nnue_architecture.h"
 #include "nnue_feature_transformer.h"
 #include "nnue_misc.h"
+#include "nnue_accumulator.h"

 namespace Stockfish {

@@ -42,13 +43,16 @@ class Network {

     bool save(const std::optional<std::string>& filename) const;

-    Value evaluate(const Position& pos, bool adjusted = false, int* complexity = nullptr) const;
+    Value evaluate(const Position&           pos,
+                   AccumulatorCaches::Cache* cache,
+                   bool                      adjusted   = false,
+                   int*                      complexity = nullptr) const;

-    void hint_common_access(const Position& pos) const;
+    void hint_common_access(const Position& pos, AccumulatorCaches::Cache* cache) const;

     void verify(std::string evalfilePath) const;

-    NnueEvalTrace trace_evaluate(const Position& pos) const;
+    NnueEvalTrace trace_evaluate(const Position& pos, AccumulatorCaches::Cache* cache) const;

    private:
     void load_user_net(const std::string&, const std::string&);
@@ -75,6 +79,8 @@ class Network {
     // Hash value of evaluation function structure
     static constexpr std::uint32_t hash =
       FeatureTransformer::get_hash_value() ^ NetworkArchitecture::get_hash_value();
+
+    friend struct AccumulatorCaches::Cache;
 };

 }  // namespace Stockfish::Eval::NNUE
diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h
index f6d705243..f3ab1a0a8 100644
--- a/src/nnue/nnue_accumulator.h
+++ b/src/nnue/nnue_accumulator.h
@@ -28,11 +28,67 @@

 namespace Stockfish::Eval::NNUE {

+using BiasType       = std::int16_t;
+using PSQTWeightType = std::int32_t;
+using IndexType      = std::uint32_t;
+
 // Class that holds the result of affine transformation of input features
 struct alignas(CacheLineSize) Accumulator {
-    std::int16_t accumulation[2][TransformedFeatureDimensions];
-    std::int32_t psqtAccumulation[2][PSQTBuckets];
-    bool         computed[2];
+    std::int16_t accumulation[COLOR_NB][TransformedFeatureDimensions];
+    std::int32_t psqtAccumulation[COLOR_NB][PSQTBuckets];
+    bool         computed[COLOR_NB];
 };
+
+
+// AccumulatorCaches struct provides per-thread accumulator caches, where each
+// cache contains multiple entries for each of the possible king squares.
+// When the accumulator needs to be refreshed, the cached entry is used to more
+// efficiently update the accumulator, instead of rebuilding it from scratch.
+// This idea was first described by Luecx (author of Koivisto) and
+// is commonly referred to as "Finny Tables".
+struct AccumulatorCaches {
+
+    struct alignas(CacheLineSize) Cache {
+
+        struct alignas(CacheLineSize) Entry {
+            BiasType       accumulation[COLOR_NB][TransformedFeatureDimensions];
+            PSQTWeightType psqtAccumulation[COLOR_NB][PSQTBuckets];
+            Bitboard       byColorBB[COLOR_NB][COLOR_NB];
+            Bitboard       byTypeBB[COLOR_NB][PIECE_TYPE_NB];
+
+            // To initialize a refresh entry, we set all its bitboards empty,
+            // so we put the biases in the accumulation, without any weights on top
+            void clear(const BiasType* biases) {
+
+                std::memset(byColorBB, 0, sizeof(byColorBB));
+                std::memset(byTypeBB, 0, sizeof(byTypeBB));
+
+                std::memcpy(accumulation[WHITE], biases,
+                            TransformedFeatureDimensions * sizeof(BiasType));
+                std::memcpy(accumulation[BLACK], biases,
+                            TransformedFeatureDimensions * sizeof(BiasType));
+
+                std::memset(psqtAccumulation, 0, sizeof(psqtAccumulation));
+            }
+        };
+
+        template<typename Network>
+        void clear(const Network& network) {
+            for (auto& entry : entries)
+                entry.clear(network.featureTransformer->biases);
+        }
+
+        Entry& operator[](int index) { return entries[index]; }
+
+        std::array<Entry, 9 * 9> entries;
+    };
+
+    template<typename Network>
+    void clear(const Network& network) {
+        cache.clear(network);
+    }
+
+    Cache cache;
+};

 }  // namespace Stockfish::Eval::NNUE
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index c7381137f..1331766fa 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -232,10 +232,10 @@ static constexpr int NumPsqtRegs =
 // Input feature converter
 class FeatureTransformer {

-   private:
     // Number of output dimensions for one side
     static constexpr IndexType HalfDimensions = TransformedFeatureDimensions;

+   private:
 #ifdef VECTOR
     static constexpr IndexType TileHeight     = NumRegs * sizeof(vec_t) / 2;
     static constexpr IndexType PsqtTileHeight = NumPsqtRegs * sizeof(psqt_vec_t) / 4;
@@ -334,9 +334,12 @@ class FeatureTransformer {
     }

     // Convert input features
-    std::int32_t transform(const Position& pos, OutputType* output, int bucket) const {
-        update_accumulator<WHITE>(pos);
-        update_accumulator<BLACK>(pos);
+    std::int32_t transform(const Position&           pos,
+                           AccumulatorCaches::Cache* cache,
+                           OutputType*               output,
+                           int                       bucket) const {
+        update_accumulator<WHITE>(pos, cache);
+        update_accumulator<BLACK>(pos, cache);

         const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
         const auto& accumulation    = pos.state()->accumulator.accumulation;
@@ -396,9 +399,9 @@ class FeatureTransformer {
         return psqt;
     }  // end of function transform()

-    void hint_common_access(const Position& pos) const {
-        hint_common_access_for_perspective<WHITE>(pos);
-        hint_common_access_for_perspective<BLACK>(pos);
+    void hint_common_access(const Position& pos, AccumulatorCaches::Cache* cache) const {
+        hint_common_access_for_perspective<WHITE>(pos, cache);
+        hint_common_access_for_perspective<BLACK>(pos, cache);
     }

    private:
@@ -662,116 +665,148 @@ class FeatureTransformer {
     }

     template<Color Perspective>
-    void update_accumulator_refresh(const Position& pos) const {
-#ifdef VECTOR
-        // Gcc-10.2 unnecessarily spills AVX2 registers if this array
-        // is defined in the VECTOR code below, once in each branch
-        vec_t      acc[NumRegs];
-        psqt_vec_t psqt[NumPsqtRegs];
-#endif
+    void update_accumulator_refresh(const Position& pos, AccumulatorCaches::Cache* cache) const {
+        assert(cache != nullptr);
+
+        const Square ksq = pos.square<KING>(Perspective);
+        const int    ab  = pos.count<ADVISOR>(Perspective) * 3 + pos.count<BISHOP>(Perspective);
+
+        auto& entry = (*cache)[FeatureSet::KingCacheMaps[ksq] * 9 + ab];

-        // Refresh the accumulator
-        // Could be extracted to a separate function because it's done in 2 places,
-        // but it's unclear if compilers would correctly handle register allocation.
         auto& accumulator                 = pos.state()->accumulator;
         accumulator.computed[Perspective] = true;
-        FeatureSet::IndexList active;
-        FeatureSet::append_active_indices<Perspective>(pos, active);
+
+        FeatureSet::IndexList removed, added;
+        for (Color c : {WHITE, BLACK})
+        {
+            for (PieceType pt = ROOK; pt <= KING; ++pt)
+            {
+                const Piece    piece = make_piece(c, pt);
+                const Bitboard oldBB =
+                  entry.byColorBB[Perspective][c] & entry.byTypeBB[Perspective][pt];
+                const Bitboard newBB    = pos.pieces(c, pt);
+                Bitboard       toRemove = oldBB & ~newBB;
+                Bitboard       toAdd    = newBB & ~oldBB;
+
+                while (toRemove)
+                {
+                    Square sq = pop_lsb(toRemove);
+                    removed.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq, ab));
+                }
+                while (toAdd)
+                {
+                    Square sq = pop_lsb(toAdd);
+                    added.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq, ab));
+                }
+            }
+        }

 #ifdef VECTOR
+        vec_t      acc[NumRegs];
+        psqt_vec_t psqt[NumPsqtRegs];
+
         for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j)
         {
-            auto biasesTile = reinterpret_cast<const vec_t*>(&biases[j * TileHeight]);
+            auto entryTile =
+              reinterpret_cast<vec_t*>(&entry.accumulation[Perspective][j * TileHeight]);
             for (IndexType k = 0; k < NumRegs; ++k)
-                acc[k] = biasesTile[k];
+                acc[k] = entryTile[k];

-            int i = 0;
-            for (; i < int(active.size()) - 1; i += 2)
+            for (int i = 0; i < int(added.size()); ++i)
             {
-                IndexType       index0  = active[i];
-                IndexType       index1  = active[i + 1];
-                const IndexType offset0 = HalfDimensions * index0 + j * TileHeight;
-                const IndexType offset1 = HalfDimensions * index1 + j * TileHeight;
-                auto            column0 = reinterpret_cast<const vec_t*>(&weights[offset0]);
-                auto            column1 = reinterpret_cast<const vec_t*>(&weights[offset1]);
+                IndexType       index  = added[i];
+                const IndexType offset = HalfDimensions * index + j * TileHeight;
+                auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);

                 for (unsigned k = 0; k < NumRegs; ++k)
-                    acc[k] = vec_add_16(acc[k], vec_add_16(column0[k], column1[k]));
+                    acc[k] = vec_add_16(acc[k], column[k]);
             }
-            for (; i < int(active.size()); ++i)
+            for (int i = 0; i < int(removed.size()); ++i)
             {
-                IndexType       index  = active[i];
+                IndexType       index  = removed[i];
                 const IndexType offset = HalfDimensions * index + j * TileHeight;
                 auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);

                 for (unsigned k = 0; k < NumRegs; ++k)
-                    acc[k] = vec_add_16(acc[k], column[k]);
+                    acc[k] = vec_sub_16(acc[k], column[k]);
             }

-            auto accTile =
-              reinterpret_cast<vec_t*>(&accumulator.accumulation[Perspective][j * TileHeight]);
-            for (unsigned k = 0; k < NumRegs; k++)
-                vec_store(&accTile[k], acc[k]);
+            for (IndexType k = 0; k < NumRegs; k++)
+                vec_store(&entryTile[k], acc[k]);
         }

         for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j)
         {
+            auto entryTilePsqt = reinterpret_cast<psqt_vec_t*>(
+              &entry.psqtAccumulation[Perspective][j * PsqtTileHeight]);
             for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                psqt[k] = vec_zero_psqt();
+                psqt[k] = entryTilePsqt[k];

-            int i = 0;
-            for (; i < int(active.size()) - 1; i += 2)
+            for (int i = 0; i < int(added.size()); ++i)
             {
-                IndexType       index0      = active[i];
-                IndexType       index1      = active[i + 1];
-                const IndexType offset0     = PSQTBuckets * index0 + j * PsqtTileHeight;
-                const IndexType offset1     = PSQTBuckets * index1 + j * PsqtTileHeight;
-                auto            columnPsqt0 = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset0]);
-                auto            columnPsqt1 = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset1]);
+                IndexType       index      = added[i];
+                const IndexType offset     = PSQTBuckets * index + j * PsqtTileHeight;
+                auto            columnPsqt = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);

                 for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                    psqt[k] =
-                      vec_add_psqt_32(psqt[k], vec_add_psqt_32(columnPsqt0[k], columnPsqt1[k]));
+                    psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]);
             }
-            for (; i < int(active.size()); ++i)
+            for (int i = 0; i < int(removed.size()); ++i)
             {
-                IndexType       index      = active[i];
+                IndexType       index      = removed[i];
                 const IndexType offset     = PSQTBuckets * index + j * PsqtTileHeight;
                 auto            columnPsqt = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);

                 for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                    psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]);
+                    psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]);
             }

-            auto accTilePsqt = reinterpret_cast<psqt_vec_t*>(
-              &accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]);
             for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                vec_store_psqt(&accTilePsqt[k], psqt[k]);
+                vec_store_psqt(&entryTilePsqt[k], psqt[k]);
         }

 #else
-        std::memcpy(accumulator.accumulation[Perspective], biases,
-                    HalfDimensions * sizeof(BiasType));
-        for (std::size_t k = 0; k < PSQTBuckets; ++k)
-            accumulator.psqtAccumulation[Perspective][k] = 0;
-
-        for (const auto index : active)
+        for (const auto index : added)
         {
             const IndexType offset = HalfDimensions * index;
+            for (IndexType j = 0; j < HalfDimensions; ++j)
+                entry.accumulation[Perspective][j] += weights[offset + j];
+
+            for (std::size_t k = 0; k < PSQTBuckets; ++k)
+                entry.psqtAccumulation[Perspective][k] += psqtWeights[index * PSQTBuckets + k];
+        }
+        for (const auto index : removed)
+        {
+            const IndexType offset = HalfDimensions * index;
             for (IndexType j = 0; j < HalfDimensions; ++j)
-                accumulator.accumulation[Perspective][j] += weights[offset + j];
+                entry.accumulation[Perspective][j] -= weights[offset + j];

             for (std::size_t k = 0; k < PSQTBuckets; ++k)
-                accumulator.psqtAccumulation[Perspective][k] +=
-                    psqtWeights[index * PSQTBuckets + k];
+                entry.psqtAccumulation[Perspective][k] -= psqtWeights[index * PSQTBuckets + k];
         }
+
 #endif
+
+        // The accumulator of the refresh entry has been updated.
+        // Now copy its content to the actual accumulator we were refreshing
+
+        std::memcpy(accumulator.psqtAccumulation[Perspective], entry.psqtAccumulation[Perspective],
+                    sizeof(int32_t) * PSQTBuckets);
+
+        std::memcpy(accumulator.accumulation[Perspective], entry.accumulation[Perspective],
+                    sizeof(BiasType) * HalfDimensions);
+
+        for (Color c : {WHITE, BLACK})
+            entry.byColorBB[Perspective][c] = pos.pieces(c);
+
+        for (PieceType pt = ROOK; pt <= KING; ++pt)
+            entry.byTypeBB[Perspective][pt] = pos.pieces(pt);
     }

     template<Color Perspective>
-    void hint_common_access_for_perspective(const Position& pos) const {
+    void hint_common_access_for_perspective(const Position&           pos,
+                                            AccumulatorCaches::Cache* cache) const {

         // Works like update_accumulator, but performs less work.
         // Updates ONLY the accumulator for pos.
@@ -791,11 +826,11 @@ class FeatureTransformer {
             update_accumulator_incremental(pos, oldest_st, states_to_update);
         }
         else
-            update_accumulator_refresh<Perspective>(pos);
+            update_accumulator_refresh<Perspective>(pos, cache);
     }

     template<Color Perspective>
-    void update_accumulator(const Position& pos) const {
+    void update_accumulator(const Position& pos, AccumulatorCaches::Cache* cache) const {

         auto [oldest_st, next] = try_find_computed_accumulator(pos);

@@ -816,10 +851,12 @@ class FeatureTransformer {
         }
         else
         {
-            update_accumulator_refresh<Perspective>(pos);
+            update_accumulator_refresh<Perspective>(pos, cache);
         }
     }

+    friend struct AccumulatorCaches::Cache;
+
     alignas(CacheLineSize) BiasType       biases[HalfDimensions];
     alignas(CacheLineSize) WeightType     weights[HalfDimensions * InputDimensions];
     alignas(CacheLineSize) PSQTWeightType psqtWeights[InputDimensions * PSQTBuckets];
diff --git a/src/nnue/nnue_misc.cpp b/src/nnue/nnue_misc.cpp
index 8da47e9c3..762fe49e5 100644
--- a/src/nnue/nnue_misc.cpp
+++ b/src/nnue/nnue_misc.cpp
@@ -38,9 +38,11 @@ namespace Stockfish::Eval::NNUE {

 constexpr std::string_view PieceToChar(" RACPNBK racpnbk");

-void hint_common_parent_position(const Position& pos, const Network& network) {
+void hint_common_parent_position(const Position&    pos,
+                                 const Network&     network,
+                                 AccumulatorCaches& caches) {

-    network.hint_common_access(pos);
+    network.hint_common_access(pos, &caches.cache);
 }

 namespace {
@@ -96,7 +98,7 @@ void format_cp_aligned_dot(Value v, std::stringstream& stream, const Position& p

 // Returns a string with the value of each piece on a board,
 // and a table for (PSQT, Layers) values bucket by bucket.
-std::string trace(Position& pos, const Eval::NNUE::Network& network) {
+std::string trace(Position& pos, const Eval::NNUE::Network& network, AccumulatorCaches& caches) {

     std::stringstream ss;

@@ -122,7 +124,7 @@ std::string trace(Position& pos, const Eval::NNUE::Network& network) {

     // We estimate the value of each piece by doing a differential evaluation from
     // the current base eval, simulating the removal of the piece from its square.
-    Value base = network.evaluate(pos);
+    Value base = network.evaluate(pos, &caches.cache);
     base       = pos.side_to_move() == WHITE ? base : -base;

     for (File f = FILE_A; f <= FILE_I; ++f)
@@ -140,7 +142,7 @@ std::string trace(Position& pos, const Eval::NNUE::Network& network) {
             st->accumulator.computed[WHITE] = false;
             st->accumulator.computed[BLACK] = false;

-            Value eval = network.evaluate(pos);
+            Value eval = network.evaluate(pos, &caches.cache);
             eval       = pos.side_to_move() == WHITE ? eval : -eval;
             v          = base - eval;

@@ -157,7 +159,7 @@ std::string trace(Position& pos, const Eval::NNUE::Network& network) {
         ss << board[row] << '\n';
     ss << '\n';

-    auto t = network.trace_evaluate(pos);
+    auto t = network.trace_evaluate(pos, &caches.cache);

     ss << " NNUE network contributions "
        << (pos.side_to_move() == WHITE ? "(White to move)" : "(Black to move)") << std::endl
@@ -171,11 +173,14 @@ std::string trace(Position& pos, const Eval::NNUE::Network& network) {
         ss << "| " << bucket << " ";
         ss << " | ";
         format_cp_aligned_dot(t.psqt[bucket], ss, pos);
-        ss << " " << " | ";
+        ss << " "
+           << " | ";
         format_cp_aligned_dot(t.positional[bucket], ss, pos);
-        ss << " " << " | ";
+        ss << " "
+           << " | ";
         format_cp_aligned_dot(t.psqt[bucket] + t.positional[bucket], ss, pos);
-        ss << " " << " |";
+        ss << " "
+           << " |";
         if (bucket == t.correctBucket)
             ss << " <-- this bucket is used";
         ss << '\n';
diff --git a/src/nnue/nnue_misc.h b/src/nnue/nnue_misc.h
index ce9960a36..e48516f10 100644
--- a/src/nnue/nnue_misc.h
+++ b/src/nnue/nnue_misc.h
@@ -47,12 +47,13 @@ struct NnueEvalTrace {
     std::size_t correctBucket;
 };

-
 class Network;
+struct AccumulatorCaches;

-
-std::string trace(Position& pos, const Network& network);
-void hint_common_parent_position(const Position& pos, const Network& network);
+std::string trace(Position& pos, const Network& network, AccumulatorCaches& caches);
+void hint_common_parent_position(const Position&    pos,
+                                 const Network&     network,
+                                 AccumulatorCaches& caches);

 }  // namespace Stockfish::Eval::NNUE
 }  // namespace Stockfish
diff --git a/src/search.cpp b/src/search.cpp
index 6fb9a9632..1a28e5d40 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -33,6 +33,8 @@
 #include "misc.h"
 #include "movegen.h"
 #include "movepick.h"
+#include "nnue/network.h"
+#include "nnue/nnue_accumulator.h"
 #include "nnue/nnue_common.h"
 #include "nnue/nnue_misc.h"
 #include "position.h"
@@ -108,6 +110,7 @@ Search::Worker::Worker(SharedState& sharedState,
     // Unpack the SharedState struct into member variables
     thread_idx(thread_id),
     manager(std::move(sm)),
+    refreshTable(),
     options(sharedState.options),
     threads(sharedState.threads),
     tt(sharedState.tt),
@@ -116,6 +119,10 @@
 }

 void Search::Worker::start_searching() {
+
+    // Initialize accumulator refresh entries
+    refreshTable.clear(network);
+
     // Non-main threads go directly to iterative_deepening()
     if (!is_mainthread())
     {
@@ -513,7 +520,7 @@ Value Search::Worker::search(

     if (threads.stop.load(std::memory_order_relaxed) || ss->ply >= MAX_PLY)
         return (ss->ply >= MAX_PLY && !ss->inCheck)
-                 ? evaluate(network, pos, thisThread->optimism[us])
+                 ? evaluate(network, pos, refreshTable, thisThread->optimism[us])
                  : value_draw(thisThread->nodes);

     // Step 3. Mate distance pruning. Even if we mate at the next move our score
@@ -594,7 +601,7 @@ Value Search::Worker::search(
     {
         // Providing the hint that this node's accumulator will be used often
        // brings significant Elo gain (~13 Elo).
-        Eval::NNUE::hint_common_parent_position(pos, network);
+        Eval::NNUE::hint_common_parent_position(pos, network, refreshTable);
         unadjustedStaticEval = eval = ss->staticEval;
     }
     else if (ss->ttHit)
@@ -602,9 +609,9 @@ Value Search::Worker::search(
     {
         // Never assume anything about values stored in TT
         unadjustedStaticEval = tte->eval();
         if (unadjustedStaticEval == VALUE_NONE)
-            unadjustedStaticEval = evaluate(network, pos, thisThread->optimism[us]);
+            unadjustedStaticEval = evaluate(network, pos, refreshTable, thisThread->optimism[us]);
         else if (PvNode)
-            Eval::NNUE::hint_common_parent_position(pos, network);
+            Eval::NNUE::hint_common_parent_position(pos, network, refreshTable);

         ss->staticEval = eval = to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos);
@@ -614,7 +621,7 @@ Value Search::Worker::search(
     }
     else
     {
-        unadjustedStaticEval = evaluate(network, pos, thisThread->optimism[us]);
+        unadjustedStaticEval = evaluate(network, pos, refreshTable, thisThread->optimism[us]);
         ss->staticEval = eval = to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos);

         // Static evaluation is saved as it was before adjustment by correction history
@@ -771,7 +778,7 @@ Value Search::Worker::search(
             }
         }

-        Eval::NNUE::hint_common_parent_position(pos, network);
+        Eval::NNUE::hint_common_parent_position(pos, network, refreshTable);
     }

 moves_loop:  // When in check, search starts here
@@ -1320,7 +1327,8 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta,
     }

     if (ss->ply >= MAX_PLY)
-        return !ss->inCheck ? evaluate(network, pos, thisThread->optimism[us]) : VALUE_DRAW;
+        return !ss->inCheck ? evaluate(network, pos, refreshTable, thisThread->optimism[us])
+                            : VALUE_DRAW;

     assert(0 <= ss->ply && ss->ply < MAX_PLY);

@@ -1351,7 +1359,8 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta,
             // Never assume anything about values stored in TT
             unadjustedStaticEval = tte->eval();
             if (unadjustedStaticEval == VALUE_NONE)
-                unadjustedStaticEval = evaluate(network, pos, thisThread->optimism[us]);
+                unadjustedStaticEval =
+                  evaluate(network, pos, refreshTable, thisThread->optimism[us]);

             ss->staticEval = bestValue =
               to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos);
@@ -1364,7 +1373,7 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta,
         {
             // In case of null move search, use previous static eval with a different sign
             unadjustedStaticEval = (ss - 1)->currentMove != Move::null()
-                                   ? evaluate(network, pos, thisThread->optimism[us])
+                                   ? evaluate(network, pos, refreshTable, thisThread->optimism[us])
                                    : -(ss - 1)->staticEval;
             ss->staticEval = bestValue =
               to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos);
diff --git a/src/search.h b/src/search.h
index 28edf6b4e..e09eae7b5 100644
--- a/src/search.h
+++ b/src/search.h
@@ -26,9 +26,9 @@
 #include
 #include
 #include
+#include
 #include
 #include
-#include

 #include "misc.h"
 #include "movepick.h"
@@ -36,6 +36,7 @@
 #include "score.h"
 #include "timeman.h"
 #include "types.h"
+#include "nnue/nnue_accumulator.h"

 namespace Stockfish {

@@ -295,6 +296,10 @@ class Worker {
     // The main thread has a SearchManager, the others have a NullSearchManager
     std::unique_ptr<ISearchManager> manager;

+    // Used by NNUE
+
+    Eval::NNUE::AccumulatorCaches refreshTable;
+
     const OptionsMap&   options;
     ThreadPool&         threads;
     TranspositionTable& tt;
diff --git a/src/uci.cpp b/src/uci.cpp
index f13fadad7..4556bf087 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -275,9 +275,9 @@ void UCIEngine::bench(std::istream& args) {

     dbg_print();

-    std::cerr << "\n===========================" << "\nTotal time (ms) : " << elapsed
-              << "\nNodes searched : " << nodes << "\nNodes/second : " << 1000 * nodes / elapsed
-              << std::endl;
+    std::cerr << "\n==========================="
+              << "\nTotal time (ms) : " << elapsed << "\nNodes searched : " << nodes
+              << "\nNodes/second : " << 1000 * nodes / elapsed << std::endl;

     // reset callback, to not capture a dangling reference to nodesSearched
     engine.set_on_update_full([&](const auto& i) { on_update_full(i, options["UCI_ShowWDL"]); });
@@ -290,7 +290,7 @@ void UCIEngine::setoption(std::istringstream& is) {
 }

 std::uint64_t UCIEngine::perft(const Search::LimitsType& limits) {
-    auto nodes = engine.perft(engine.fen(), limits.perft, engine.get_options()["UCI_Chess960"]);
+    auto nodes = engine.perft(engine.fen(), limits.perft);
     sync_cout << "\nNodes searched: " << nodes << "\n" << sync_endl;
     return nodes;
 }
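Illustrative sketch (not taken from the patch): a simplified, scalar model of the "Finny table" refresh that update_accumulator_refresh() performs above. Each cache entry remembers the board it last saw, the current position is diffed against that memory to obtain added and removed features, and only those weight columns are applied to the cached accumulator before it is copied out. The names NUM_FEATURES, DIMS, Entry::occupancy and refresh() are hypothetical stand-ins for the real FeatureSet::make_index() indices, TransformedFeatureDimensions and the byColorBB/byTypeBB bitboards.

    #include <array>
    #include <cstdint>

    constexpr int NUM_FEATURES = 64;  // hypothetical feature count (one per occupancy bit)
    constexpr int DIMS         = 8;   // hypothetical accumulator width

    struct Entry {                                       // plays the role of AccumulatorCaches::Cache::Entry
        std::array<std::int16_t, DIMS> accumulation{};   // starts out holding only the biases
        std::uint64_t                  occupancy = 0;    // stands in for byColorBB/byTypeBB
    };

    // weights[f] is the column added to the accumulator while feature f is active
    std::array<std::array<std::int16_t, DIMS>, NUM_FEATURES> weights{};

    // Diff the board the entry last saw against the current one and apply only the
    // changes, instead of rebuilding the accumulator from scratch.
    void refresh(Entry& entry, std::uint64_t current) {
        const std::uint64_t added   = current & ~entry.occupancy;
        const std::uint64_t removed = entry.occupancy & ~current;

        for (int f = 0; f < NUM_FEATURES; ++f)
        {
            if (added & (1ULL << f))
                for (int d = 0; d < DIMS; ++d)
                    entry.accumulation[d] += weights[f][d];
            if (removed & (1ULL << f))
                for (int d = 0; d < DIMS; ++d)
                    entry.accumulation[d] -= weights[f][d];
        }
        entry.occupancy = current;  // the entry now describes the current board
    }

    int main() {
        Entry entry;           // a freshly cleared entry: biases only, empty board
        refresh(entry, 0x0F);  // first use: features 0..3 are added
        refresh(entry, 0x0E);  // later refresh: only feature 0 has to be removed
    }

As in the patch, the cost of a refresh is proportional to how much the board changed since the entry was last used, rather than to the number of pieces on the board, which is what makes the per-king-bucket cache worthwhile.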