From f170793928af10360ea5f846a2de685cbfff7e29 Mon Sep 17 00:00:00 2001 From: LittleSheep Date: Thu, 26 Jun 2025 18:34:51 +0800 Subject: [PATCH] :lipstick: Optimized web articles --- .../Connection/WebReader/ScrapedArticle.cs | 7 +++++ .../Connection/WebReader/WebArticle.cs | 6 ++++ .../Connection/WebReader/WebFeedController.cs | 31 ++++++++++++++++++- .../Connection/WebReader/WebFeedService.cs | 30 +++++++++++++----- .../Connection/WebReader/WebReaderService.cs | 25 +++++++++++++++ .../DysonNetwork.Sphere.csproj | 1 + DysonNetwork.sln.DotSettings.user | 1 + 7 files changed, 93 insertions(+), 8 deletions(-) create mode 100644 DysonNetwork.Sphere/Connection/WebReader/ScrapedArticle.cs diff --git a/DysonNetwork.Sphere/Connection/WebReader/ScrapedArticle.cs b/DysonNetwork.Sphere/Connection/WebReader/ScrapedArticle.cs new file mode 100644 index 0000000..6c39028 --- /dev/null +++ b/DysonNetwork.Sphere/Connection/WebReader/ScrapedArticle.cs @@ -0,0 +1,7 @@ +namespace DysonNetwork.Sphere.Connection.WebReader; + +public class ScrapedArticle +{ + public LinkEmbed LinkEmbed { get; set; } = null!; + public string? Content { get; set; } +} \ No newline at end of file diff --git a/DysonNetwork.Sphere/Connection/WebReader/WebArticle.cs b/DysonNetwork.Sphere/Connection/WebReader/WebArticle.cs index 3172f35..2a874f2 100644 --- a/DysonNetwork.Sphere/Connection/WebReader/WebArticle.cs +++ b/DysonNetwork.Sphere/Connection/WebReader/WebArticle.cs @@ -23,6 +23,11 @@ public class WebArticle : ModelBase public WebFeed Feed { get; set; } = null!; } +public class WebFeedConfig +{ + public bool ScrapPage { get; set; } +} + public class WebFeed : ModelBase { public Guid Id { get; set; } = Guid.NewGuid(); @@ -31,6 +36,7 @@ public class WebFeed : ModelBase [MaxLength(8192)] public string? Description { get; set; } [Column(TypeName = "jsonb")] public LinkEmbed? Preview { get; set; } + [Column(TypeName = "jsonb")] public WebFeedConfig Config { get; set; } = new(); public Guid PublisherId { get; set; } public Publisher.Publisher Publisher { get; set; } = null!; diff --git a/DysonNetwork.Sphere/Connection/WebReader/WebFeedController.cs b/DysonNetwork.Sphere/Connection/WebReader/WebFeedController.cs index 785ff95..0db67fa 100644 --- a/DysonNetwork.Sphere/Connection/WebReader/WebFeedController.cs +++ b/DysonNetwork.Sphere/Connection/WebReader/WebFeedController.cs @@ -1,13 +1,15 @@ using System.ComponentModel.DataAnnotations; +using DysonNetwork.Sphere.Permission; using Microsoft.AspNetCore.Authorization; using Microsoft.AspNetCore.Mvc; +using Microsoft.EntityFrameworkCore; namespace DysonNetwork.Sphere.Connection.WebReader; [Authorize] [ApiController] [Route("feeds")] -public class WebFeedController(WebFeedService webFeedService) : ControllerBase +public class WebFeedController(WebFeedService webFeedService, AppDatabase database) : ControllerBase { public class CreateWebFeedRequest { @@ -30,4 +32,31 @@ public class WebFeedController(WebFeedService webFeedService) : ControllerBase var feed = await webFeedService.CreateWebFeedAsync(request, User); return Ok(feed); } + + [HttpPost("scrape/{feedId}")] + [RequiredPermission("maintenance", "web-feeds")] + public async Task ScrapeFeed(Guid feedId) + { + var feed = await database.Set().FindAsync(feedId); + if (feed == null) + { + return NotFound(); + } + + await webFeedService.ScrapeFeedAsync(feed); + return Ok(); + } + + [HttpPost("scrape-all")] + [RequiredPermission("maintenance", "web-feeds")] + public async Task ScrapeAllFeeds() + { + var feeds = await database.Set().ToListAsync(); + foreach (var feed in feeds) + { + await webFeedService.ScrapeFeedAsync(feed); + } + + return Ok(); + } } diff --git a/DysonNetwork.Sphere/Connection/WebReader/WebFeedService.cs b/DysonNetwork.Sphere/Connection/WebReader/WebFeedService.cs index 6fdc21e..7b36f1e 100644 --- a/DysonNetwork.Sphere/Connection/WebReader/WebFeedService.cs +++ b/DysonNetwork.Sphere/Connection/WebReader/WebFeedService.cs @@ -11,10 +11,11 @@ public class WebFeedService( AppDatabase database, IHttpClientFactory httpClientFactory, ILogger logger, - AccountService accountService -) + AccountService accountService, + WebReaderService webReaderService + ) { - public async Task CreateWebFeedAsync(WebFeedController.CreateWebFeedRequest dto, ClaimsPrincipal claims) + public async Task CreateWebFeedAsync(WebFeedController.CreateWebFeedRequest request, ClaimsPrincipal claims) { if (claims.Identity?.Name == null) { @@ -29,9 +30,9 @@ public class WebFeedService( var feed = new WebFeed { - Url = dto.Url, - Title = dto.Title, - Description = dto.Description, + Url = request.Url, + Title = request.Title, + Description = request.Description, PublisherId = account.Id, }; @@ -73,14 +74,29 @@ public class WebFeedService( continue; } + var content = (item.Content as TextSyndicationContent)?.Text ?? item.Summary.Text; + LinkEmbed preview; + + if (feed.Config.ScrapPage) + { + var scrapedArticle = await webReaderService.ScrapeArticleAsync(itemUrl, cancellationToken); + preview = scrapedArticle.LinkEmbed; + content = scrapedArticle.Content; + } + else + { + preview = await webReaderService.GetLinkPreviewAsync(itemUrl, cancellationToken); + } + var newArticle = new WebArticle { FeedId = feed.Id, Title = item.Title.Text, Url = itemUrl, Author = item.Authors.FirstOrDefault()?.Name, - Content = (item.Content as TextSyndicationContent)?.Text ?? item.Summary.Text, + Content = content, PublishedAt = item.PublishDate.UtcDateTime, + Preview = preview, }; database.Set().Add(newArticle); diff --git a/DysonNetwork.Sphere/Connection/WebReader/WebReaderService.cs b/DysonNetwork.Sphere/Connection/WebReader/WebReaderService.cs index 2947b26..040b48e 100644 --- a/DysonNetwork.Sphere/Connection/WebReader/WebReaderService.cs +++ b/DysonNetwork.Sphere/Connection/WebReader/WebReaderService.cs @@ -2,6 +2,7 @@ using System.Globalization; using AngleSharp; using AngleSharp.Dom; using DysonNetwork.Sphere.Storage; +using HtmlAgilityPack; namespace DysonNetwork.Sphere.Connection.WebReader; @@ -17,6 +18,30 @@ public class WebReaderService( private const string LinkPreviewCachePrefix = "scrap:preview:"; private const string LinkPreviewCacheGroup = "scrap:preview"; + public async Task ScrapeArticleAsync(string url, CancellationToken cancellationToken = default) + { + var linkEmbed = await GetLinkPreviewAsync(url, cancellationToken); + var content = await GetArticleContentAsync(url, cancellationToken); + return new ScrapedArticle + { + LinkEmbed = linkEmbed, + Content = content + }; + } + + private async Task GetArticleContentAsync(string url, CancellationToken cancellationToken) + { + var httpClient = httpClientFactory.CreateClient("WebReader"); + var response = await httpClient.GetAsync(url, cancellationToken); + response.EnsureSuccessStatusCode(); + var html = await response.Content.ReadAsStringAsync(cancellationToken); + var doc = new HtmlDocument(); + doc.LoadHtml(html); + var articleNode = doc.DocumentNode.SelectSingleNode("//article"); + return articleNode?.InnerHtml; + } + + /// /// Generate a link preview embed from a URL /// diff --git a/DysonNetwork.Sphere/DysonNetwork.Sphere.csproj b/DysonNetwork.Sphere/DysonNetwork.Sphere.csproj index 0010bfa..ac0f4ae 100644 --- a/DysonNetwork.Sphere/DysonNetwork.Sphere.csproj +++ b/DysonNetwork.Sphere/DysonNetwork.Sphere.csproj @@ -23,6 +23,7 @@ + diff --git a/DysonNetwork.sln.DotSettings.user b/DysonNetwork.sln.DotSettings.user index b7e3e51..23b3a09 100644 --- a/DysonNetwork.sln.DotSettings.user +++ b/DysonNetwork.sln.DotSettings.user @@ -78,6 +78,7 @@ ForceIncluded ForceIncluded ForceIncluded + ForceIncluded ForceIncluded ForceIncluded ForceIncluded