Skip to content

Commit

Permalink
+ entity class LatestReplierRevision that has composite PK `TakenAt…
Browse files Browse the repository at this point in the history
…, Uid` like type 4 of https://en.wikipedia.org/wiki/Slowly_changing_dimension

* now inserts a `LatestReplierRevision` entity to store the changed uid, which might be caused by two users swapping their (display)name
* fix the exception thrown by `.First()` when no user matches the latest replier of the parent thread
@ `ThreadLatestReplierSaver.SaveFromUser()`
@ c#/crawler

* rename variables of type `IEnumerable<IGrouping<,>>` to conform to the naming convention `pluralNoun+GroupBy`, like `sKeyBy` for `IDictionary<,>`
@ c#
  • Loading branch information
n0099 committed Jul 11, 2024
1 parent 3644cfe commit 654feca
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 10 deletions.
3 changes: 3 additions & 0 deletions c#/crawler/src/Db/CrawlerDbContext.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ public class CrawlerDbContext(ILogger<CrawlerDbContext> logger, Fid fid = 0)
public Fid Fid { get; } = fid;
public DbSet<User> Users => Set<User>();
public DbSet<LatestReplier> LatestRepliers => Set<LatestReplier>();
public DbSet<LatestReplierRevision> LatestReplierRevisions => Set<LatestReplierRevision>();
public DbSet<AuthorExpGradeRevision> AuthorExpGradeRevisions => Set<AuthorExpGradeRevision>();
public DbSet<ForumModeratorRevision> ForumModeratorRevisions => Set<ForumModeratorRevision>();
public DbSet<ThreadPost> Threads => Set<ThreadPost>();
Expand Down Expand Up @@ -72,6 +73,8 @@ protected override void OnModelCreating(ModelBuilder b)
b.Entity<LatestReplier>().Property(e => e.DisplayName).HasConversion<byte[]>();
b.Entity<LatestReplier>().HasOne<ThreadPost>().WithOne(e => e.LatestReplier)
.HasForeignKey<ThreadPost>(e => e.LatestReplierId);
b.Entity<LatestReplierRevision>().ToTable("tbmcr_user_latestReplier").HasKey(e => new {e.TakenAt, e.Uid});
b.Entity<LatestReplierRevision>().Property(e => e.DisplayName).HasConversion<byte[]>();
b.Entity<ThreadPost>().ToTable($"tbmc_f{Fid}_thread");
b.Entity<ThreadMissingFirstReply>().ToTable("tbmc_thread_missingFirstReply");
b.Entity<ReplyPost>().ToTable($"tbmc_f{Fid}_reply");
Expand Down
11 changes: 11 additions & 0 deletions c#/crawler/src/Db/Revision/LatestReplierRevision.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// ReSharper disable PropertyCanBeMadeInitOnly.Global
namespace tbm.Crawler.Db.Revision;

/// <summary>
/// Snapshot of a thread's latest replier taken right before its <c>Uid</c> is overwritten.
/// <c>CrawlerDbContext.OnModelCreating()</c> maps this entity to the table
/// <c>tbmcr_user_latestReplier</c> with the composite primary key (<see cref="TakenAt"/>, <see cref="Uid"/>).
/// Rows are added by <c>ThreadLatestReplierSaver.SaveFromUser()</c> when the uid resolved from the crawled users
/// differs from the non-null uid already stored on the latest replier.
/// </summary>
public class LatestReplierRevision
{
/// <summary>
/// First part of the composite primary key. Populated from the latest replier's
/// <c>UpdatedAt</c>, falling back to its <c>CreatedAt</c> when <c>UpdatedAt</c> is null.
/// NOTE(review): presumably a Unix timestamp in seconds — confirm against the source entity.
/// </summary>
public uint TakenAt { get; set; }
/// <summary>Copied from the <c>Id</c> of the latest replier this revision was taken from.</summary>
public uint Id { get; set; }
/// <summary>
/// Second part of the composite primary key: the uid the latest replier had
/// before it was replaced with the uid of the newly matched user.
/// </summary>
public long Uid { get; set; }
/// <summary>The latest replier's <c>Name</c> at the time the revision was taken; may be null.</summary>
public string? Name { get; set; }
/// <summary>
/// The latest replier's <c>DisplayName</c> at the time the revision was taken; may be null.
/// Configured in <c>CrawlerDbContext</c> with a value conversion to <c>byte[]</c>.
/// </summary>
public string? DisplayName { get; set; }
}
Original file line number Diff line number Diff line change
Expand Up @@ -40,16 +40,29 @@ public Action SaveFromUser(CrawlerDbContext db, Tid tid, IEnumerable<User> users
// possible race: two users swapped their name or displayName
// within the timespan between crawling threads and crawling their (sub)replies
// so the one crawled later is not the original latest replier of the thread
var user = users
var matchedUsers = users
.Where(u => u.Name == threadLatestReplier.Name
&& u.DisplayName == threadLatestReplier.DisplayName)
.DistinctBy(u => u.Uid).ToList();
if (user.Count > 1)
Helper.LogDifferentValuesSharingTheSameKeyInEntities(logger, user,
if (matchedUsers.Count == 0) return () => { };
if (matchedUsers.Count > 1)
Helper.LogDifferentValuesSharingTheSameKeyInEntities(logger, matchedUsers,
$"{nameof(User.Name)} and {nameof(User.DisplayName)}",
u => u.Uid, u => (u.Name, u.DisplayName));

threadLatestReplier.Uid = user.First().Uid;
var user = matchedUsers[0];
if (threadLatestReplier.Uid == user.Uid) return () => { };
if (threadLatestReplier.Uid != null)
_ = db.LatestReplierRevisions.Add(new()
{
TakenAt = threadLatestReplier.UpdatedAt ?? threadLatestReplier.CreatedAt,
Id = threadLatestReplier.Id,
Uid = threadLatestReplier.Uid.Value,
Name = threadLatestReplier.Name,
DisplayName = threadLatestReplier.DisplayName
});

threadLatestReplier.Uid = user.Uid;
_ = _saverLocks.Value.Acquire([UniqueLatestReplier.FromLatestReplier(threadLatestReplier)]);
return _saverLocks.Value.Dispose;
}
Expand Down
8 changes: 4 additions & 4 deletions c#/crawler/src/Worker/RetryCrawlWorker.cs
Original file line number Diff line number Diff line change
Expand Up @@ -60,18 +60,18 @@ private async Task RetryThreadLate(
CancellationToken stoppingToken = default)
{
await using var threadLateFacade = threadLateCrawlFacadeFactory();
foreach (var tidGroupByFid in failureCountWithPagesKeyByLockId
foreach (var tidsGroupByFid in failureCountWithPagesKeyByLockId
.Keys.GroupBy(lockId => lockId.Fid, lockId => lockId.Tid))
{
var fid = tidGroupByFid.Key;
var fid = tidsGroupByFid.Key;
FailureCount FailureCountSelector(Tid tid) =>

// it should always contain only one page which is 1
failureCountWithPagesKeyByLockId[new(fid, tid)].Single().Value;
var failureCountsKeyByTid = tidGroupByFid
var failureCountsKeyByTid = tidsGroupByFid
.Cast<Tid>().ToDictionary(tid => tid, FailureCountSelector);
logger.LogTrace("Retrying previous failed thread late crawl with fid={}, threadsId={}",
fid, SharedHelper.UnescapedJsonSerialize(tidGroupByFid));
fid, SharedHelper.UnescapedJsonSerialize(tidsGroupByFid));
await threadLateFacade.Value(fid).CrawlThenSave(failureCountsKeyByTid, stoppingToken);
}
}
Expand Down
4 changes: 2 additions & 2 deletions c#/imagePipeline/src/Ocr/JointRecognizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,8 @@ public IReadOnlyDictionary<ImageKey, string> GetRecognizedTextLines
})
.OrderBy(t => t.alignedY).ThenBy(t => t.X)
.GroupBy(t => t.alignedY, t => t.result)
.Select(groupByLine =>
string.Join('\n', groupByLine.Select(result => result.Text.Trim())));
.Select(resultsGroupByLine =>
string.Join('\n', resultsGroupByLine.Select(result => result.Text.Trim())));
// https://unicode.org/reports/tr15/
return string.Join('\n', resultTextLines).Normalize(NormalizationForm.FormKC);
Expand Down

0 comments on commit 654feca

Please sign in to comment.