Skip to content

Commit

Permalink
- method ParseLatestRepliers()
Browse files Browse the repository at this point in the history
- method `OnBeforeCommitSave()` and its overridden parent in abstract class `CrawlFacade`
@ ThreadCrawlFacade.cs

- method `ShouldIgnoreEntityRevision()` and its overridden parent in abstract class `SaverWithRevision` @ UserSaver.cs
- method `CreateLatestReplier()` & nested class `EqualityComparer` @ User.cs
@ c#
  • Loading branch information
n0099 committed Jul 10, 2024
1 parent 4274baa commit 10d5cbd
Show file tree
Hide file tree
Showing 6 changed files with 0 additions and 82 deletions.
30 changes: 0 additions & 30 deletions c#/crawler/src/Db/User.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,34 +9,4 @@ public class User : BaseUser
public string? FansNickname { get; set; }
public byte[]? Icon { get; set; }
public string? IpGeolocation { get; set; }

/// <summary>
/// Builds a partially filled <see cref="User"/> for a latest replier:
/// only the uid and the two names are taken from the arguments,
/// and <c>Portrait</c> is set to an empty string.
/// </summary>
/// <param name="uid">Value assigned to <c>Uid</c>.</param>
/// <param name="name">Value assigned to <c>Name</c>, may be <c>null</c>.</param>
/// <param name="displayName">Value assigned to <c>DisplayName</c>, may be <c>null</c>.</param>
/// <returns>A new <see cref="User"/> instance with the four properties above initialized.</returns>
public static User CreateLatestReplier(long uid, string? name, string? displayName)
{
    return new User {Uid = uid, Name = name, DisplayName = displayName, Portrait = ""};
}

/// <summary>
/// Compares two <see cref="User"/> instances property by property
/// (Uid, Name, DisplayName, Portrait, PortraitUpdatedAt, Gender, FansNickname, IpGeolocation and Icon).
/// </summary>
public class EqualityComparer : EqualityComparer<User>
{
    /// <summary>Shared instance of this comparer.</summary>
    public static EqualityComparer Instance { get; } = new();

    /// <summary>
    /// Returns <c>true</c> when both arguments are the same reference (or both <c>null</c>),
    /// or when both are non-null and every compared property matches.
    /// </summary>
    public override bool Equals(User? x, User? y)
    {
        if (x == y) return true;
        if (x == null || y == null) return false;

        var scalarPropsMatch =
            (x.Uid, x.Name, x.DisplayName, x.Portrait, x.PortraitUpdatedAt, x.Gender, x.FansNickname, x.IpGeolocation)
            == (y.Uid, y.Name, y.DisplayName, y.Portrait, y.PortraitUpdatedAt, y.Gender, y.FansNickname, y.IpGeolocation);
        if (!scalarPropsMatch) return false;

        // same array instance, or both null
        if (x.Icon == y.Icon) return true;

        // otherwise both icons must exist and be equal according to ByteArrayEqualityComparer
        return x.Icon != null && y.Icon != null
            && ByteArrayEqualityComparer.Instance.Equals(x.Icon, y.Icon);
    }

    /// <summary>
    /// Combines the same properties that <see cref="Equals(User?, User?)"/> inspects into one hash code.
    /// </summary>
    public override int GetHashCode(User obj)
    {
        HashCode combined = new();

        // the order of the calls below affects the resulting value, keep it stable
        combined.Add(obj.Uid);
        combined.Add(obj.Name);
        combined.Add(obj.DisplayName);
        combined.Add(obj.Portrait);
        combined.Add(obj.PortraitUpdatedAt);
        combined.Add(obj.Gender);
        combined.Add(obj.FansNickname);
        combined.AddBytes(obj.Icon);
        combined.Add(obj.IpGeolocation);

        return combined.ToHashCode();
    }
}
}
3 changes: 0 additions & 3 deletions c#/crawler/src/Tieba/Crawl/Facade/CrawlFacade.cs
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,6 @@ public virtual void Dispose()
postSaver.UserFieldUpdateIgnorance,
postSaver.UserFieldRevisionIgnorance);

OnBeforeCommitSave(db, userSaver);

db.TimestampingEntities();
_ = db.SaveChanges();
transaction.Commit();
Expand Down Expand Up @@ -133,7 +131,6 @@ protected virtual void OnPostParse(
TResponse response,
CrawlRequestFlag flag,
IReadOnlyDictionary<PostId, TPost> parsedPostsInResponse) { }
/// <summary>
/// Extension hook that the save routine of this class invokes with the current
/// <paramref name="db"/> and <paramref name="userSaver"/> right before it calls
/// <c>db.TimestampingEntities()</c>, <c>db.SaveChanges()</c> and commits the transaction.
/// The default implementation does nothing.
/// </summary>
/// <param name="db">The db context whose changes are about to be saved.</param>
/// <param name="userSaver">The user saver taking part in the current save.</param>
protected virtual void OnBeforeCommitSave(CrawlerDbContext db, UserSaver userSaver) { }
protected virtual void OnPostCommitSave(
SaverChangeSet<TPost> savedPosts,
CancellationToken stoppingToken = default) { }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ protected override void OnPostParse(
if (flag == CrawlRequestFlag.ThreadClientVersion602) return;
var data = response.Data;
UserParser.Parse(data.ThreadList.Select(th => th.Author));
ParseLatestRepliers(data.ThreadList);
FillFromRequestingWith602(data.ThreadList);

// parsed author uid will be 0 when request with client version 6.0.2
Expand Down
34 changes: 0 additions & 34 deletions c#/crawler/src/Tieba/Crawl/Facade/ThreadCrawlFacade.cs
Original file line number Diff line number Diff line change
Expand Up @@ -18,28 +18,6 @@ public class ThreadCrawlFacade(

public delegate ThreadCrawlFacade New(Fid fid, string forumName);

/// <summary>
/// Adds the latest repliers collected in <c>_latestRepliers</c> that are not yet tracked by
/// <paramref name="db"/> to <c>db.Users</c>, restricted to the uids returned by
/// <c>userSaver.AcquireUidLocksForSave()</c>.
/// </summary>
/// <param name="db">The db context whose change tracker is inspected and which receives the new users.</param>
/// <param name="userSaver">The user saver used to acquire uid locks for the users about to be added.</param>
protected override void OnBeforeCommitSave(CrawlerDbContext db, UserSaver userSaver)
{ // OnBeforeCommitSave() is expected to be invoked by base.SaveCrawled() after UserSaver.Save(),
// so only the latest repliers that do not exist among the parsed users get inserted.
// Note that this bypasses user revision detection, since the users are passed directly
// to DbContext.AddRange() instead of going through SaverWithRevision.SaveEntitiesWithRevision().

// uids of the users that have already been added into the DbContext and are being tracked
var existingUsersId = db.ChangeTracker.Entries<User>().Select(ee => ee.Entity.Uid);
var newLatestRepliers = _latestRepliers.ExceptByKey(existingUsersId).Values().ToList();
if (newLatestRepliers.Count == 0) return;

// presumably returns only the uids whose lock was newly acquired by this call - verify against UserSaver
var newlyLockedLatestRepliers = userSaver.AcquireUidLocksForSave
(newLatestRepliers.Select(u => u.Uid));
// NOTE(review): despite its name, IntersectBy() makes this variable keep only
// the users whose uid IS contained in newlyLockedLatestRepliers
var newLatestRepliersExceptLocked = newLatestRepliers
.IntersectBy(newlyLockedLatestRepliers, u => u.Uid)
.Select(u =>
{
// Select() is deferred, so CreatedAt gets assigned when the sequence is enumerated
u.CreatedAt = SharedHelper.GetNowTimestamp();
return u;
});
db.Users.AddRange(newLatestRepliersExceptLocked);
}

protected override void OnPostParse(
ThreadResponse response,
CrawlRequestFlag flag,
Expand All @@ -50,24 +28,12 @@ protected override void OnPostParse(
if (flag != CrawlRequestFlag.None) return;
UserParser.Parse(data.UserList);
UserParser.ResetUsersIcon();
ParseLatestRepliers(data.ThreadList);

// remove livepost threads since their real parent forum may not match with current crawling fid
data.ThreadList.Where(th => th.LivePostType != "")
.ForEach(th => Posts.TryRemove((Tid)th.Tid, out _));
}

/// <summary>
/// Collects the latest replier of every given thread into <c>_latestRepliers</c>, keyed by uid.
/// A later thread with the same replier uid overwrites the earlier entry.
/// </summary>
/// <param name="threads">Threads whose <c>LastReplyer</c> should be collected.</param>
protected void ParseLatestRepliers(IEnumerable<Thread> threads)
{
    // LastReplyer will be null when LivePostType != "", OfType() filters out those nulls
    var repliersInResponse = threads
        .Select(th => th.LastReplyer)
        .OfType<TbClient.User>();

    foreach (var replier in repliersInResponse)
    {
        // some rare deleted thread but still visible in 6.0.2 response
        // will have the latest replier uid=0 name="" nameShow=".*"
        if (replier.Uid == 0) continue;

        // the display name is only kept when it differs from the name
        var displayName = replier.Name == replier.NameShow ? null : replier.NameShow;
        var latestReplier = User.CreateLatestReplier(replier.Uid, replier.Name.NullIfEmpty(), displayName);
        _latestRepliers[latestReplier.Uid] = latestReplier;
    }
}

protected void FillFromRequestingWith602(IEnumerable<Thread> threads) =>
(from inResponse in threads
join parsed in Posts.Values on (Tid)inResponse.Tid equals parsed.Tid
Expand Down
2 changes: 0 additions & 2 deletions c#/crawler/src/Tieba/Crawl/Saver/SaverWithRevision.cs
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ protected class RevisionIdWithDuplicateIndexProjection
}
public partial class SaverWithRevision<TBaseRevision, TRevisionId>
{
/// <summary>
/// Extension hook consulted for each modified, non-timestamping property of an entity entry:
/// when it returns <c>true</c> the calling routine in this class stops inspecting
/// the entry and returns <c>null</c>. The default implementation always returns <c>false</c>.
/// </summary>
/// <param name="propName">Name of the property being inspected.</param>
/// <param name="propEntry">Change tracking entry of that property.</param>
/// <param name="entityEntry">Change tracking entry of the entity that owns the property.</param>
/// <returns><c>false</c> unless overridden.</returns>
protected virtual bool ShouldIgnoreEntityRevision(string propName, PropertyEntry propEntry, EntityEntry entityEntry) => false;
protected virtual bool FieldUpdateIgnorance(string propName, object? oldValue, object? newValue) => false;
protected virtual bool FieldRevisionIgnorance(string propName, object? oldValue, object? newValue) => false;
private static bool GlobalFieldUpdateIgnorance(string propName, object? oldValue, object? newValue) => propName switch
Expand Down Expand Up @@ -107,7 +106,6 @@ bool IsTimestampingFieldName(string name) => name is nameof(BasePost.LastSeenAt)
{
var pName = p.Metadata.Name;
if (!p.IsModified || IsTimestampingFieldName(pName)) continue;
if (ShouldIgnoreEntityRevision(pName, p, entityEntry)) return null;
if (FieldUpdateIgnorance(
pName, p.OriginalValue, p.CurrentValue)
Expand Down
12 changes: 0 additions & 12 deletions c#/crawler/src/Tieba/Crawl/Saver/UserSaver.cs
Original file line number Diff line number Diff line change
Expand Up @@ -70,18 +70,6 @@ protected override Expression<Func<BaseUserRevision, RevisionIdWithDuplicateInde
}
public partial class UserSaver
{
/// <summary>
/// Tells whether the entity being inspected was originally stored as a partially filled latest replier,
/// i.e. its original values equal what <c>User.CreateLatestReplier()</c> produces for the same uid and names.
/// </summary>
/// <param name="propName">Name of the property being inspected, only <c>Portrait</c> is considered.</param>
/// <param name="propEntry">Change tracking entry of that property.</param>
/// <param name="entityEntry">Change tracking entry of the owning user entity.</param>
/// <returns>
/// <c>true</c> when the property is <c>Portrait</c>, its original value is an empty string
/// and the original entity equals a freshly created latest replier, otherwise <c>false</c>.
/// </returns>
protected override bool ShouldIgnoreEntityRevision(string propName, PropertyEntry propEntry, EntityEntry entityEntry)
{
    // ThreadCrawlFacade.ParseLatestRepliers() saves partially filled users of latest repliers for livepost threads,
    // they may later get updated by the (sub) reply crawler once it finds out the latest reply,
    // so the revision update of all their fields should be ignored
    var isOriginalPortraitEmpty = propName == nameof(User.Portrait) && propEntry.OriginalValue is "";
    if (!isOriginalPortraitEmpty) return false;

    var originalUser = (User)entityEntry.OriginalValues.ToObject();
    var expectedLatestReplier = User.CreateLatestReplier(
        originalUser.Uid, originalUser.Name, originalUser.DisplayName);

    return User.EqualityComparer.Instance.Equals(originalUser, expectedLatestReplier);
}

protected override bool FieldUpdateIgnorance
(string propName, object? oldValue, object? newValue) => propName switch
{ // possible randomly respond with null
Expand Down

0 comments on commit 10d5cbd

Please sign in to comment.