From ccb8a4e3f46e4d42b8ae341796e9ee2aaa9a72d3 Mon Sep 17 00:00:00 2001
From: LittleSheep
Date: Mon, 30 Jun 2025 23:26:05 +0800
Subject: [PATCH] :sparkles: Bug fixes on web feed & scraping

---
 DysonNetwork.Sphere/Connection/WebReader/WebArticle.cs | 4 ++--
 .../Connection/WebReader/WebFeedController.cs          | 3 ++-
 .../Connection/WebReader/WebFeedService.cs             | 6 +++++-
 .../Connection/WebReader/WebReaderService.cs           | 9 +++++++--
 4 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/DysonNetwork.Sphere/Connection/WebReader/WebArticle.cs b/DysonNetwork.Sphere/Connection/WebReader/WebArticle.cs
index 2a874f2..9d0d089 100644
--- a/DysonNetwork.Sphere/Connection/WebReader/WebArticle.cs
+++ b/DysonNetwork.Sphere/Connection/WebReader/WebArticle.cs
@@ -31,8 +31,8 @@ public class WebFeedConfig
 public class WebFeed : ModelBase
 {
     public Guid Id { get; set; } = Guid.NewGuid();
-    [MaxLength(8192)] public string Url { get; set; }
-    [MaxLength(4096)] public string Title { get; set; }
+    [MaxLength(8192)] public string Url { get; set; } = null!;
+    [MaxLength(4096)] public string Title { get; set; } = null!;
     [MaxLength(8192)] public string? Description { get; set; }
 
     [Column(TypeName = "jsonb")] public LinkEmbed? Preview { get; set; }
diff --git a/DysonNetwork.Sphere/Connection/WebReader/WebFeedController.cs b/DysonNetwork.Sphere/Connection/WebReader/WebFeedController.cs
index 56ff646..a93d725 100644
--- a/DysonNetwork.Sphere/Connection/WebReader/WebFeedController.cs
+++ b/DysonNetwork.Sphere/Connection/WebReader/WebFeedController.cs
@@ -13,7 +13,8 @@ public class WebFeedController(WebFeedService webFeed, PublisherService ps) : Co
     public record WebFeedRequest(
         [MaxLength(8192)] string? Url,
         [MaxLength(4096)] string? Title,
-        [MaxLength(8192)] string? Description
+        [MaxLength(8192)] string? Description,
+        WebFeedConfig? Config
     );
 
     [HttpGet]
diff --git a/DysonNetwork.Sphere/Connection/WebReader/WebFeedService.cs b/DysonNetwork.Sphere/Connection/WebReader/WebFeedService.cs
index 70cc273..6547de8 100644
--- a/DysonNetwork.Sphere/Connection/WebReader/WebFeedService.cs
+++ b/DysonNetwork.Sphere/Connection/WebReader/WebFeedService.cs
@@ -11,13 +11,15 @@ public class WebFeedService(
     WebReaderService webReaderService
 )
 {
-    public async Task CreateWebFeedAsync(Publisher.Publisher publisher, WebFeedController.WebFeedRequest request)
+    public async Task CreateWebFeedAsync(Publisher.Publisher publisher,
+        WebFeedController.WebFeedRequest request)
     {
         var feed = new WebFeed
         {
             Url = request.Url!,
             Title = request.Title!,
             Description = request.Description,
+            Config = request.Config ?? new WebFeedConfig(),
             PublisherId = publisher.Id,
         };
 
@@ -48,6 +50,8 @@ public class WebFeedService(
             feed.Title = request.Title;
         if (request.Description is not null)
             feed.Description = request.Description;
+        if (request.Config is not null)
+            feed.Config = request.Config;
 
         database.Update(feed);
         await database.SaveChangesAsync();
diff --git a/DysonNetwork.Sphere/Connection/WebReader/WebReaderService.cs b/DysonNetwork.Sphere/Connection/WebReader/WebReaderService.cs
index 040b48e..d7f9bda 100644
--- a/DysonNetwork.Sphere/Connection/WebReader/WebReaderService.cs
+++ b/DysonNetwork.Sphere/Connection/WebReader/WebReaderService.cs
@@ -33,7 +33,11 @@ public class WebReaderService(
     {
         var httpClient = httpClientFactory.CreateClient("WebReader");
         var response = await httpClient.GetAsync(url, cancellationToken);
-        response.EnsureSuccessStatusCode();
+        if (!response.IsSuccessStatusCode)
+        {
+            logger.LogWarning("Failed to scrap article content for URL: {Url}", url);
+            return null;
+        }
         var html = await response.Content.ReadAsStringAsync(cancellationToken);
         var doc = new HtmlDocument();
         doc.LoadHtml(html);
@@ -74,7 +78,8 @@ public class WebReaderService(
         // Cache miss or bypass, fetch fresh data
         logger.LogDebug("Fetching fresh link preview for URL: {Url}", url);
         var httpClient = httpClientFactory.CreateClient("WebReader");
-        httpClient.MaxResponseContentBufferSize = 10 * 1024 * 1024; // 10MB, prevent scrap some directly accessible files
+        httpClient.MaxResponseContentBufferSize =
+            10 * 1024 * 1024; // 10MB, prevent scrap some directly accessible files
         httpClient.Timeout = TimeSpan.FromSeconds(3);
         // Setting UA to facebook's bot to get the opengraph.
         httpClient.DefaultRequestHeaders.Add("User-Agent", "facebookexternalhit/1.1");