diff --git a/c#/crawler/src/Db/CrawlerDbContext.cs b/c#/crawler/src/Db/CrawlerDbContext.cs index c8ba14f1..5025318c 100644 --- a/c#/crawler/src/Db/CrawlerDbContext.cs +++ b/c#/crawler/src/Db/CrawlerDbContext.cs @@ -16,6 +16,7 @@ public class CrawlerDbContext(ILogger logger, Fid fid = 0) public Fid Fid { get; } = fid; public DbSet Users => Set(); public DbSet LatestRepliers => Set(); + public DbSet LatestReplierRevisions => Set(); public DbSet AuthorExpGradeRevisions => Set(); public DbSet ForumModeratorRevisions => Set(); public DbSet Threads => Set(); @@ -72,6 +73,8 @@ protected override void OnModelCreating(ModelBuilder b) b.Entity().Property(e => e.DisplayName).HasConversion(); b.Entity().HasOne().WithOne(e => e.LatestReplier) .HasForeignKey(e => e.LatestReplierId); + b.Entity().ToTable("tbmcr_user_latestReplier").HasKey(e => new {e.TakenAt, e.Uid}); + b.Entity().Property(e => e.DisplayName).HasConversion(); b.Entity().ToTable($"tbmc_f{Fid}_thread"); b.Entity().ToTable("tbmc_thread_missingFirstReply"); b.Entity().ToTable($"tbmc_f{Fid}_reply"); diff --git a/c#/crawler/src/Db/Revision/LatestReplierRevision.cs b/c#/crawler/src/Db/Revision/LatestReplierRevision.cs new file mode 100644 index 00000000..173e4f6c --- /dev/null +++ b/c#/crawler/src/Db/Revision/LatestReplierRevision.cs @@ -0,0 +1,11 @@ +// ReSharper disable PropertyCanBeMadeInitOnly.Global +namespace tbm.Crawler.Db.Revision; + +public class LatestReplierRevision +{ + public uint TakenAt { get; set; } + public uint Id { get; set; } + public long Uid { get; set; } + public string? Name { get; set; } + public string? DisplayName { get; set; } +} diff --git a/c#/crawler/src/Tieba/Crawl/Saver/Related/ThreadLatestReplierSaver.cs b/c#/crawler/src/Tieba/Crawl/Saver/Related/ThreadLatestReplierSaver.cs index 921e637f..2dbf598e 100644 --- a/c#/crawler/src/Tieba/Crawl/Saver/Related/ThreadLatestReplierSaver.cs +++ b/c#/crawler/src/Tieba/Crawl/Saver/Related/ThreadLatestReplierSaver.cs @@ -40,16 +40,29 @@ public Action SaveFromUser(CrawlerDbContext db, Tid tid, IEnumerable users // possible race: two user swapped their name or displayName // within the timespan of crawling threads and crawling its (sub)replies // so the one later crawled is not the original latest replier of thread - var user = users + var matchedUsers = users .Where(u => u.Name == threadLatestReplier.Name && u.DisplayName == threadLatestReplier.DisplayName) .DistinctBy(u => u.Uid).ToList(); - if (user.Count > 1) - Helper.LogDifferentValuesSharingTheSameKeyInEntities(logger, user, + if (matchedUsers.Count == 0) return () => { }; + if (matchedUsers.Count > 1) + Helper.LogDifferentValuesSharingTheSameKeyInEntities(logger, matchedUsers, $"{nameof(User.Name)} and {nameof(User.DisplayName)}", u => u.Uid, u => (u.Name, u.DisplayName)); - threadLatestReplier.Uid = user.First().Uid; + var user = matchedUsers[0]; + if (threadLatestReplier.Uid == user.Uid) return () => { }; + if (threadLatestReplier.Uid != null) + _ = db.LatestReplierRevisions.Add(new() + { + TakenAt = threadLatestReplier.UpdatedAt ?? threadLatestReplier.CreatedAt, + Id = threadLatestReplier.Id, + Uid = threadLatestReplier.Uid.Value, + Name = threadLatestReplier.Name, + DisplayName = threadLatestReplier.DisplayName + }); + + threadLatestReplier.Uid = user.Uid; _ = _saverLocks.Value.Acquire([UniqueLatestReplier.FromLatestReplier(threadLatestReplier)]); return _saverLocks.Value.Dispose; } diff --git a/c#/crawler/src/Worker/RetryCrawlWorker.cs b/c#/crawler/src/Worker/RetryCrawlWorker.cs index 73f51079..d2a23be4 100644 --- a/c#/crawler/src/Worker/RetryCrawlWorker.cs +++ b/c#/crawler/src/Worker/RetryCrawlWorker.cs @@ -60,18 +60,18 @@ private async Task RetryThreadLate( CancellationToken stoppingToken = default) { await using var threadLateFacade = threadLateCrawlFacadeFactory(); - foreach (var tidGroupByFid in failureCountWithPagesKeyByLockId + foreach (var tidsGroupByFid in failureCountWithPagesKeyByLockId .Keys.GroupBy(lockId => lockId.Fid, lockId => lockId.Tid)) { - var fid = tidGroupByFid.Key; + var fid = tidsGroupByFid.Key; FailureCount FailureCountSelector(Tid tid) => // it should always contain only one page which is 1 failureCountWithPagesKeyByLockId[new(fid, tid)].Single().Value; - var failureCountsKeyByTid = tidGroupByFid + var failureCountsKeyByTid = tidsGroupByFid .Cast().ToDictionary(tid => tid, FailureCountSelector); logger.LogTrace("Retrying previous failed thread late crawl with fid={}, threadsId={}", - fid, SharedHelper.UnescapedJsonSerialize(tidGroupByFid)); + fid, SharedHelper.UnescapedJsonSerialize(tidsGroupByFid)); await threadLateFacade.Value(fid).CrawlThenSave(failureCountsKeyByTid, stoppingToken); } } diff --git a/c#/imagePipeline/src/Ocr/JointRecognizer.cs b/c#/imagePipeline/src/Ocr/JointRecognizer.cs index 570f627d..6d60a7bf 100644 --- a/c#/imagePipeline/src/Ocr/JointRecognizer.cs +++ b/c#/imagePipeline/src/Ocr/JointRecognizer.cs @@ -88,8 +88,8 @@ public IReadOnlyDictionary GetRecognizedTextLines }) .OrderBy(t => t.alignedY).ThenBy(t => t.X) .GroupBy(t => t.alignedY, t => t.result) - .Select(groupByLine => - string.Join('\n', groupByLine.Select(result => result.Text.Trim()))); + .Select(resultsGroupByLine => + string.Join('\n', resultsGroupByLine.Select(result => result.Text.Trim()))); // https://unicode.org/reports/tr15/ return string.Join('\n', resultTextLines).Normalize(NormalizationForm.FormKC);