Skip to content

Commit

Permalink
* partial revert f2388a7 @ ThreadCrawlFacade.cs & `ThreadLatestReplierSaver.SaveFromThread()`
Browse files Browse the repository at this point in the history

* fix `EntityEntry.CurrentValues.SetValues()` not updating navigation properties @ `SaverWithRevision.SaveEntitiesWithRevision()`
* add primary ctor param `existingAfterInTracking` so that the entities in `Existing[].After` & `AllAfter` are the ones being tracked by DbContext instead of the ones from `PostSaver.Posts` that are produced by `PostParser.Convert()` @ SaverChangeSet.cs
* remove the `user_` part from the table names of entity classes `LatestReplier(Revision)?`
@ c#/crawler
  • Loading branch information
n0099 committed Jul 12, 2024
1 parent f2388a7 commit b81b64a
Show file tree
Hide file tree
Showing 7 changed files with 41 additions and 37 deletions.
4 changes: 2 additions & 2 deletions c#/crawler/src/Db/CrawlerDbContext.cs
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,11 @@ protected override void OnModelCreating(ModelBuilder b)
base.OnModelCreating(b);
OnModelCreatingWithFid(b, Fid);
b.Entity<User>().ToTable("tbmc_user");
b.Entity<LatestReplier>().ToTable("tbmc_user_latestReplier");
b.Entity<LatestReplier>().ToTable("tbmc_latestReplier");
b.Entity<LatestReplier>().Property(e => e.DisplayName).HasConversion<byte[]>();
b.Entity<LatestReplier>().HasOne<ThreadPost>().WithOne(e => e.LatestReplier)
.HasForeignKey<ThreadPost>(e => e.LatestReplierId);
b.Entity<LatestReplierRevision>().ToTable("tbmcr_user_latestReplier").HasKey(e => new {e.TakenAt, e.Uid});
b.Entity<LatestReplierRevision>().ToTable("tbmcr_latestReplier").HasKey(e => new {e.TakenAt, e.Uid});
b.Entity<LatestReplierRevision>().Property(e => e.DisplayName).HasConversion<byte[]>();
b.Entity<ThreadPost>().ToTable($"tbmc_f{Fid}_thread");
b.Entity<ThreadMissingFirstReply>().ToTable("tbmc_thread_missingFirstReply");
Expand Down
14 changes: 10 additions & 4 deletions c#/crawler/src/Tieba/Crawl/Facade/ThreadCrawlFacade.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ public class ThreadCrawlFacade(
postParser, postSaverFactory.Invoke,
userParserFactory.Invoke, userSaverFactory.Invoke)
{
private readonly Dictionary<ThreadLatestReplierSaver.UniqueLatestReplier, LatestReplier?> _latestRepliersKeyByUnique = [];

public delegate ThreadCrawlFacade New(Fid fid, string forumName);

protected override void OnPostParse(
Expand Down Expand Up @@ -42,17 +44,21 @@ join parsed in Posts.Values on (Tid)inResponse.Tid equals parsed.Tid
{ // replace with more detailed location.name in the 6.0.2 response
t.parsed.Geolocation = Helper.SerializedProtoBufOrNullIfEmpty(t.inResponse.Location);
}
var lastReplyer = t.inResponse.LastReplyer;
var name = lastReplyer?.Name.NullIfEmpty();
var nameShow = lastReplyer?.NameShow.NullIfEmpty();
var name = t.inResponse.LastReplyer.Name.NullIfEmpty();
var nameShow = t.inResponse.LastReplyer.NameShow.NullIfEmpty();
// LastReplyer will be null when LivePostType != "", but LastTimeInt will have expected timestamp value
t.parsed.LatestReplier = lastReplyer == null ? null : new LatestReplier
var latestReplierEntity = t.inResponse.LastReplyer == null ? null : new LatestReplier
{
Name = name,
#pragma warning disable S3358 // Ternary operators should not be nested
DisplayName = name == nameShow ? null : nameShow
#pragma warning restore S3358 // Ternary operators should not be nested
};
var uniqueLatestReplier = ThreadLatestReplierSaver.UniqueLatestReplier.FromLatestReplier(latestReplierEntity);
var isExists = _latestRepliersKeyByUnique.TryGetValue(uniqueLatestReplier, out var existingLatestReplier);
if (!isExists) _latestRepliersKeyByUnique[uniqueLatestReplier] = latestReplierEntity;
t.parsed.LatestReplier = isExists ? existingLatestReplier : latestReplierEntity;
});
}
6 changes: 3 additions & 3 deletions c#/crawler/src/Tieba/Crawl/Saver/Post/PostSaver.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,12 @@ protected SaverChangeSet<TPost> Save<TRevision>(
var existingPostsKeyById = db.Set<TPost>().AsTracking()
.Where(existingPostPredicate).ToDictionary(postIdSelector);

// deep copy before entities get mutated by SaverWithRevision.SaveEntitiesWithRevision()
var existingBeforeMerge = existingPostsKeyById.Select(pair => (TPost)pair.Value.Clone()).ToList();
// clone before entities get mutated by SaverWithRevision.SaveEntitiesWithRevision()
var existingPostsBeforeMerge = existingPostsKeyById.Select(pair => (TPost)pair.Value.Clone()).ToList();

SaveEntitiesWithRevision(db, revisionFactory,
Posts.Values.ToLookup(p => existingPostsKeyById.ContainsKey(postIdSelector(p))),
p => existingPostsKeyById[postIdSelector(p)]);
return new(existingBeforeMerge, Posts.Values, postIdSelector);
return new(postIdSelector, existingPostsBeforeMerge, Posts.Values, existingPostsKeyById.Values);
}
}
2 changes: 1 addition & 1 deletion c#/crawler/src/Tieba/Crawl/Saver/Post/ThreadSaver.cs
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ protected override bool FieldUpdateIgnorance
nameof(ThreadPost.Title)
when newValue is ""

// prevent repeatedly update with different title
// prevent update repeatedly with different title
// because the thread is a multi-forum topic thread,
// so its title can vary within the forum and within the thread
|| (newValue is not "" && oldValue is not "") => true,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
using Microsoft.EntityFrameworkCore.ChangeTracking;

namespace tbm.Crawler.Tieba.Crawl.Saver.Related;

public class ThreadLatestReplierSaver(
Expand All @@ -12,27 +10,10 @@ public class ThreadLatestReplierSaver(

public Action SaveFromThread(CrawlerDbContext db, IReadOnlyCollection<ThreadPost> threads)
{
static void DetachThenReplace(
EntityEntry<LatestReplier> entityEntry,
ThreadPost thread,
LatestReplier newLatestReplier)
{
entityEntry.State = EntityState.Detached;
thread.LatestReplier = newLatestReplier;
}

var threadsGroupByUniqueLatestReplier = threads
var uniqueLatestRepliers = threads
.Where(th => th.LatestReplier != null)
.GroupBy(UniqueLatestReplier.FromThread).ToList();
threadsGroupByUniqueLatestReplier.ForEach(g =>
(from thread in g.Skip(1)
join entityEntry in db.ChangeTracker.Entries<LatestReplier>()
on thread.LatestReplier equals entityEntry.Entity
select (thread, entityEntry))
.ForEach(t => DetachThenReplace(t.entityEntry, t.thread, g.First().LatestReplier!)));

var uniqueLatestRepliers = threadsGroupByUniqueLatestReplier.Select(g => g.Key).ToList();
var existingLatestRepliers = db.LatestRepliers.AsNoTracking().FilterByItems(
.Select(UniqueLatestReplier.FromThread).ToList();
var existingLatestRepliers = db.LatestRepliers.AsTracking().FilterByItems(
uniqueLatestRepliers, (latestReplier, uniqueLatestReplier) =>
latestReplier.Name == uniqueLatestReplier.Name
&& latestReplier.DisplayName == uniqueLatestReplier.DisplayName)
Expand All @@ -43,7 +24,11 @@ on UniqueLatestReplier.FromLatestReplier(existing) equals UniqueLatestReplier.Fr
join entityEntry in db.ChangeTracker.Entries<LatestReplier>()
on thread.LatestReplier equals entityEntry.Entity // Object.ReferenceEquals()
select (existing, thread, entityEntry))
.ForEach(t => DetachThenReplace(t.entityEntry, t.thread, t.existing));
.ForEach(t =>
{
t.entityEntry.State = EntityState.Detached;
t.thread.LatestReplier = t.existing;
});

_ = _saverLocks.Value.Acquire(uniqueLatestRepliers
.Except(existingLatestRepliers.Select(UniqueLatestReplier.FromLatestReplier))
Expand Down
14 changes: 10 additions & 4 deletions c#/crawler/src/Tieba/Crawl/Saver/SaverChangeSet.cs
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
namespace tbm.Crawler.Tieba.Crawl.Saver;

public class SaverChangeSet<TPost>(
Func<TPost, PostId> postIdSelector,
IReadOnlyCollection<TPost> existingBefore,
ICollection<TPost> existingAfterAndNewlyAdded,
Func<TPost, PostId> postIdSelector)
IReadOnlyCollection<TPost> existingAfterInTracking)
where TPost : BasePost
{
public IReadOnlyCollection<(TPost Before, TPost After)> Existing { get; } = existingBefore
.OrderBy(postIdSelector)
.EquiZip(existingAfterAndNewlyAdded
.EquiZip(existingAfterInTracking
.IntersectBy(existingBefore.Select(postIdSelector), postIdSelector)
.OrderBy(postIdSelector),
(before, after) => (before, after))
Expand All @@ -18,6 +19,11 @@ public class SaverChangeSet<TPost>(
.ExceptBy(existingBefore.Select(postIdSelector), postIdSelector)
.ToList().AsReadOnly();

public IReadOnlyCollection<TPost> AllAfter { get; } = existingAfterAndNewlyAdded
.ToList().AsReadOnly();
// https://stackoverflow.com/questions/3404975/left-outer-join-in-linq/23558389#23558389
public IReadOnlyCollection<TPost> AllAfter { get; } = (
from nonTracked in existingAfterAndNewlyAdded
join inTracking in existingAfterInTracking
on postIdSelector(nonTracked) equals postIdSelector(inTracking) into inTrackings
from inTracking in inTrackings.DefaultIfEmpty()
select inTracking ?? nonTracked).ToList().AsReadOnly();
}
7 changes: 7 additions & 0 deletions c#/crawler/src/Tieba/Crawl/Saver/SaverWithRevision.cs
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,13 @@ protected void SaveEntitiesWithRevision<TEntity, TRevision>(
entityEntry.CurrentValues.SetValues(newEntity); // mutate existingEntity that referenced by entry
entityEntry.Property(e => e.Version).IsModified = false; // newEntity.Version will always be default 0
// https://stackoverflow.com/questions/66206459/update-navigation-property-with-entity-currentvalues-setvalues/66491805#66491805
(from newNavigation in db.Entry(newEntity).Navigations
join existingNavigation in entityEntry.Navigations
on newNavigation.Metadata.Name equals existingNavigation.Metadata.Name
select (newNavigation, existingNavigation))
.ForEach(t => t.existingNavigation.CurrentValue = t.newNavigation.CurrentValue);
bool IsTimestampingFieldName(string name) => name is nameof(BasePost.LastSeenAt)
or nameof(TimestampedEntity.CreatedAt) or nameof(TimestampedEntity.UpdatedAt);
Expand Down

0 comments on commit b81b64a

Please sign in to comment.