♻️ I have no idea what I'm doing. Might be mixing stuff

2025-07-14 19:55:28 +08:00
parent ef9175d27d
commit cbfdb4aa60
232 changed files with 990 additions and 115807 deletions


@@ -0,0 +1,44 @@
using System.Reflection;
using System.Text.Json.Serialization;
namespace DysonNetwork.Sphere.WebReader;
/// <summary>
/// An embeddable can be used in the embeds field of a post's or message's meta
/// to render a richer type of content.
///
/// A simple example of using a link preview embed:
/// <code>
/// {
///     // ... post content
///     "meta": {
///         "embeds": [
///             {
///                 "type": "link",
///                 "title": "...",
///                 // ...
///             }
///         ]
///     }
/// }
/// </code>
/// </summary>
public abstract class EmbeddableBase
{
public abstract string Type { get; }
public Dictionary<string, object> ToDictionary()
{
var dict = new Dictionary<string, object>();
foreach (var prop in GetType().GetProperties())
{
if (prop.GetCustomAttribute<JsonIgnoreAttribute>() is not null)
continue;
var value = prop.GetValue(this);
if (value is null) continue;
dict[prop.Name] = value;
}
return dict;
}
}
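
For reference, a minimal sketch of the reflection behavior (the StickerEmbed type below is hypothetical, for illustration only): ToDictionary copies every readable public property, skipping [JsonIgnore] members and null values.

public class StickerEmbed : EmbeddableBase
{
    public override string Type => "sticker";
    public required string StickerId { get; set; }
    [JsonIgnore] public string? InternalNote { get; set; } // never exported
}

// new StickerEmbed { StickerId = "abc" }.ToDictionary()
// => { "Type": "sticker", "StickerId": "abc" }; InternalNote is skipped,
// and so is any property whose value is null.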


@@ -0,0 +1,55 @@
namespace DysonNetwork.Sphere.WebReader;
/// <summary>
/// The link embed is one of the embeddable implementations.
/// It can be used in the embeds field of a post's or message's meta.
/// </summary>
public class LinkEmbed : EmbeddableBase
{
public override string Type => "link";
/// <summary>
/// The original URL that was processed
/// </summary>
public required string Url { get; set; }
/// <summary>
/// Title of the linked content (from OpenGraph og:title, meta title, or page title)
/// </summary>
public string? Title { get; set; }
/// <summary>
/// Description of the linked content (from OpenGraph og:description or meta description)
/// </summary>
public string? Description { get; set; }
/// <summary>
/// URL to the thumbnail image (from OpenGraph og:image or other meta tags)
/// </summary>
public string? ImageUrl { get; set; }
/// <summary>
/// The favicon URL of the site
/// </summary>
public string? FaviconUrl { get; set; }
/// <summary>
/// The site name (from OpenGraph og:site_name)
/// </summary>
public string? SiteName { get; set; }
/// <summary>
/// Type of the content (from OpenGraph og:type)
/// </summary>
public string? ContentType { get; set; }
/// <summary>
/// Author of the content if available
/// </summary>
public string? Author { get; set; }
/// <summary>
/// Published date of the content if available
/// </summary>
public DateTime? PublishedDate { get; set; }
}
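
A hedged usage sketch (all field values are placeholders) of attaching a preview to a meta dictionary, following the embeds shape documented on EmbeddableBase; the surrounding post model is assumed, not part of this commit:

var embed = new LinkEmbed
{
    Url = "https://example.com/article",
    Title = "Example article",
    SiteName = "example.com",
};

// Hypothetical: a post's meta is assumed to be a Dictionary<string, object>.
var meta = new Dictionary<string, object>
{
    ["embeds"] = new List<Dictionary<string, object>> { embed.ToDictionary() }
};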


@@ -0,0 +1,7 @@
namespace DysonNetwork.Sphere.WebReader;
public class ScrapedArticle
{
public LinkEmbed LinkEmbed { get; set; } = null!;
public string? Content { get; set; }
}


@@ -0,0 +1,46 @@
using System.ComponentModel.DataAnnotations;
using System.ComponentModel.DataAnnotations.Schema;
using System.Text.Json.Serialization;
namespace DysonNetwork.Sphere.WebReader;
public class WebArticle : ModelBase
{
public Guid Id { get; set; } = Guid.NewGuid();
[MaxLength(4096)] public string Title { get; set; } = null!;
[MaxLength(8192)] public string Url { get; set; } = null!;
[MaxLength(4096)] public string? Author { get; set; }
[Column(TypeName = "jsonb")] public Dictionary<string, object>? Meta { get; set; }
[Column(TypeName = "jsonb")] public LinkEmbed? Preview { get; set; }
// ReSharper disable once EntityFramework.ModelValidation.UnlimitedStringLength
public string? Content { get; set; }
public DateTime? PublishedAt { get; set; }
public Guid FeedId { get; set; }
public WebFeed Feed { get; set; } = null!;
}
public class WebFeedConfig
{
/// <summary>
/// When true, each article's page is fetched and scraped for full content
/// and a richer preview, in addition to the feed entry itself.
/// </summary>
public bool ScrapPage { get; set; }
}
public class WebFeed : ModelBase
{
public Guid Id { get; set; } = Guid.NewGuid();
[MaxLength(8192)] public string Url { get; set; } = null!;
[MaxLength(4096)] public string Title { get; set; } = null!;
[MaxLength(8192)] public string? Description { get; set; }
[Column(TypeName = "jsonb")] public LinkEmbed? Preview { get; set; }
[Column(TypeName = "jsonb")] public WebFeedConfig Config { get; set; } = new();
public Guid PublisherId { get; set; }
public Publisher.Publisher Publisher { get; set; } = null!;
[JsonIgnore] public ICollection<WebArticle> Articles { get; set; } = new List<WebArticle>();
}
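
The jsonb columns above store POCOs (LinkEmbed, WebFeedConfig) and a dictionary. A hedged wiring sketch, assuming Npgsql 8+ with EF Core, where dynamic JSON mapping must be enabled on the data source; this registration is not part of the commit:

var dataSourceBuilder = new NpgsqlDataSourceBuilder(connectionString);
dataSourceBuilder.EnableDynamicJson(); // required for Dictionary/POCO jsonb mapping
var dataSource = dataSourceBuilder.Build();
builder.Services.AddDbContext<AppDatabase>(opt => opt.UseNpgsql(dataSource));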


@@ -0,0 +1,82 @@
using Microsoft.AspNetCore.Mvc;
using Microsoft.EntityFrameworkCore;
namespace DysonNetwork.Sphere.WebReader;
[ApiController]
[Route("/api/feeds/articles")]
public class WebArticleController(AppDatabase db) : ControllerBase
{
/// <summary>
/// Get a list of recent web articles
/// </summary>
/// <param name="limit">Maximum number of articles to return</param>
/// <param name="offset">Number of articles to skip</param>
/// <param name="feedId">Optional feed ID to filter by</param>
/// <param name="publisherId">Optional publisher ID to filter by</param>
/// <returns>List of web articles</returns>
[HttpGet]
public async Task<IActionResult> GetArticles(
[FromQuery] int limit = 20,
[FromQuery] int offset = 0,
[FromQuery] Guid? feedId = null,
[FromQuery] Guid? publisherId = null
)
{
var query = db.WebArticles
.OrderByDescending(a => a.PublishedAt)
.Include(a => a.Feed)
.AsQueryable();
if (feedId.HasValue)
query = query.Where(a => a.FeedId == feedId.Value);
if (publisherId.HasValue)
query = query.Where(a => a.Feed.PublisherId == publisherId.Value);
var totalCount = await query.CountAsync();
var articles = await query
.Skip(offset)
.Take(limit)
.ToListAsync();
Response.Headers["X-Total"] = totalCount.ToString();
return Ok(articles);
}
/// <summary>
/// Get a specific web article by ID
/// </summary>
/// <param name="id">The article ID</param>
/// <returns>The web article</returns>
[HttpGet("{id:guid}")]
[ProducesResponseType(404)]
public async Task<IActionResult> GetArticle(Guid id)
{
var article = await db.WebArticles
.Include(a => a.Feed)
.FirstOrDefaultAsync(a => a.Id == id);
if (article == null)
return NotFound();
return Ok(article);
}
/// <summary>
/// Get random web articles
/// </summary>
/// <param name="limit">Maximum number of articles to return</param>
/// <returns>List of random web articles</returns>
[HttpGet("random")]
public async Task<IActionResult> GetRandomArticles([FromQuery] int limit = 5)
{
var articles = await db.WebArticles
.OrderBy(_ => EF.Functions.Random())
.Include(a => a.Feed)
.Take(limit)
.ToListAsync();
return Ok(articles);
}
}
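
A hedged client-side sketch (not part of this commit) of consuming the paginated listing above; the base address is a placeholder:

using System.Net.Http.Json;

using var client = new HttpClient { BaseAddress = new Uri("https://example.host") };
var resp = await client.GetAsync("/api/feeds/articles?limit=20&offset=0");
resp.EnsureSuccessStatusCode();
var total = int.Parse(resp.Headers.GetValues("X-Total").First()); // total matching rows
var page = await resp.Content.ReadFromJsonAsync<List<WebArticle>>(); // current page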


@@ -0,0 +1,129 @@
using System.ComponentModel.DataAnnotations;
using DysonNetwork.Shared.Proto;
using DysonNetwork.Sphere.Publisher;
using Microsoft.AspNetCore.Authorization;
using Microsoft.AspNetCore.Mvc;
namespace DysonNetwork.Sphere.WebReader;
[Authorize]
[ApiController]
[Route("/api/publishers/{pubName}/feeds")]
public class WebFeedController(WebFeedService webFeed, PublisherService ps) : ControllerBase
{
public record WebFeedRequest(
[MaxLength(8192)] string? Url,
[MaxLength(4096)] string? Title,
[MaxLength(8192)] string? Description,
WebFeedConfig? Config
);
[HttpGet]
public async Task<IActionResult> ListFeeds([FromRoute] string pubName)
{
var publisher = await ps.GetPublisherByName(pubName);
if (publisher is null) return NotFound();
var feeds = await webFeed.GetFeedsByPublisherAsync(publisher.Id);
return Ok(feeds);
}
[HttpGet("{id:guid}")]
public async Task<IActionResult> GetFeed([FromRoute] string pubName, Guid id)
{
var publisher = await ps.GetPublisherByName(pubName);
if (publisher is null) return NotFound();
var feed = await webFeed.GetFeedAsync(id, publisherId: publisher.Id);
if (feed == null)
return NotFound();
return Ok(feed);
}
[HttpPost]
[Authorize]
public async Task<IActionResult> CreateWebFeed([FromRoute] string pubName, [FromBody] WebFeedRequest request)
{
if (HttpContext.Items["CurrentUser"] is not Account currentUser) return Unauthorized();
if (string.IsNullOrWhiteSpace(request.Url) || string.IsNullOrWhiteSpace(request.Title))
return BadRequest("Url and title are required");
var publisher = await ps.GetPublisherByName(pubName);
if (publisher is null) return NotFound();
var accountId = Guid.Parse(currentUser.Id);
if (!await ps.IsMemberWithRole(publisher.Id, accountId, PublisherMemberRole.Editor))
return StatusCode(403, "You must be an editor of the publisher to create a web feed");
var feed = await webFeed.CreateWebFeedAsync(publisher, request);
return Ok(feed);
}
[HttpPatch("{id:guid}")]
[Authorize]
public async Task<IActionResult> UpdateFeed([FromRoute] string pubName, Guid id, [FromBody] WebFeedRequest request)
{
if (HttpContext.Items["CurrentUser"] is not Account currentUser) return Unauthorized();
var publisher = await ps.GetPublisherByName(pubName);
if (publisher is null) return NotFound();
var accountId = Guid.Parse(currentUser.Id);
if (!await ps.IsMemberWithRole(publisher.Id, accountId, PublisherMemberRole.Editor))
return StatusCode(403, "You must be an editor of the publisher to update a web feed");
var feed = await webFeed.GetFeedAsync(id, publisherId: publisher.Id);
if (feed == null)
return NotFound();
feed = await webFeed.UpdateFeedAsync(feed, request);
return Ok(feed);
}
[HttpDelete("{id:guid}")]
[Authorize]
public async Task<IActionResult> DeleteFeed([FromRoute] string pubName, Guid id)
{
if (HttpContext.Items["CurrentUser"] is not Account currentUser) return Unauthorized();
var publisher = await ps.GetPublisherByName(pubName);
if (publisher is null) return NotFound();
var accountId = Guid.Parse(currentUser.Id);
if (!await ps.IsMemberWithRole(publisher.Id, accountId, PublisherMemberRole.Editor))
return StatusCode(403, "You must be an editor of the publisher to delete a web feed");
var feed = await webFeed.GetFeedAsync(id, publisherId: publisher.Id);
if (feed == null)
return NotFound();
var result = await webFeed.DeleteFeedAsync(id);
if (!result)
return NotFound();
return NoContent();
}
[HttpPost("{id:guid}/scrap")]
[Authorize]
public async Task<ActionResult> Scrap([FromRoute] string pubName, Guid id)
{
if (HttpContext.Items["CurrentUser"] is not Account currentUser) return Unauthorized();
var publisher = await ps.GetPublisherByName(pubName);
if (publisher is null) return NotFound();
var accountId = Guid.Parse(currentUser.Id);
if (!await ps.IsMemberWithRole(publisher.Id, accountId, PublisherMemberRole.Editor))
return StatusCode(403, "You must be an editor of the publisher to scrape a web feed");
var feed = await webFeed.GetFeedAsync(id, publisherId: publisher.Id);
if (feed == null)
{
return NotFound();
}
await webFeed.ScrapeFeedAsync(feed);
return Ok();
}
}


@@ -0,0 +1,35 @@
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Logging;
using Quartz;
namespace DysonNetwork.Sphere.WebReader;
[DisallowConcurrentExecution]
public class WebFeedScraperJob(
AppDatabase database,
WebFeedService webFeedService,
ILogger<WebFeedScraperJob> logger
)
: IJob
{
public async Task Execute(IJobExecutionContext context)
{
logger.LogInformation("Starting web feed scraper job.");
var feeds = await database.Set<WebFeed>().ToListAsync(context.CancellationToken);
foreach (var feed in feeds)
{
try
{
await webFeedService.ScrapeFeedAsync(feed, context.CancellationToken);
}
catch (Exception ex)
{
logger.LogError(ex, "Failed to scrape web feed {FeedId}", feed.Id);
}
}
logger.LogInformation("Web feed scraper job finished.");
}
}
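
The commit doesn't show how this job is scheduled; a hedged registration sketch using Quartz's DI extensions (the cron expression is an assumption, hourly here for illustration):

builder.Services.AddQuartz(q =>
{
    var jobKey = new JobKey(nameof(WebFeedScraperJob));
    q.AddJob<WebFeedScraperJob>(opts => opts.WithIdentity(jobKey));
    q.AddTrigger(opts => opts
        .ForJob(jobKey)
        .WithIdentity($"{nameof(WebFeedScraperJob)}-trigger")
        .WithCronSchedule("0 0 * * * ?")); // top of every hour — illustrative only
});
builder.Services.AddQuartzHostedService(opts => opts.WaitForJobsToComplete = true);

[DisallowConcurrentExecution] above ensures overlapping runs are queued rather than run in parallel if a scrape outlasts the trigger interval.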


@@ -0,0 +1,135 @@
using System.ServiceModel.Syndication;
using System.Xml;
using Microsoft.EntityFrameworkCore;
namespace DysonNetwork.Sphere.WebReader;
public class WebFeedService(
AppDatabase database,
IHttpClientFactory httpClientFactory,
ILogger<WebFeedService> logger,
WebReaderService webReaderService
)
{
public async Task<WebFeed> CreateWebFeedAsync(Publisher.Publisher publisher,
WebFeedController.WebFeedRequest request)
{
var feed = new WebFeed
{
Url = request.Url!,
Title = request.Title!,
Description = request.Description,
Config = request.Config ?? new WebFeedConfig(),
PublisherId = publisher.Id,
};
database.Set<WebFeed>().Add(feed);
await database.SaveChangesAsync();
return feed;
}
public async Task<WebFeed?> GetFeedAsync(Guid id, Guid? publisherId = null)
{
var query = database.WebFeeds.Where(a => a.Id == id).AsQueryable();
if (publisherId.HasValue)
query = query.Where(a => a.PublisherId == publisherId.Value);
return await query.FirstOrDefaultAsync();
}
public async Task<List<WebFeed>> GetFeedsByPublisherAsync(Guid publisherId)
{
return await database.WebFeeds.Where(a => a.PublisherId == publisherId).ToListAsync();
}
public async Task<WebFeed> UpdateFeedAsync(WebFeed feed, WebFeedController.WebFeedRequest request)
{
if (request.Url is not null)
feed.Url = request.Url;
if (request.Title is not null)
feed.Title = request.Title;
if (request.Description is not null)
feed.Description = request.Description;
if (request.Config is not null)
feed.Config = request.Config;
database.Update(feed);
await database.SaveChangesAsync();
return feed;
}
public async Task<bool> DeleteFeedAsync(Guid id)
{
var feed = await database.WebFeeds.FindAsync(id);
if (feed == null)
{
return false;
}
database.WebFeeds.Remove(feed);
await database.SaveChangesAsync();
return true;
}
public async Task ScrapeFeedAsync(WebFeed feed, CancellationToken cancellationToken = default)
{
var httpClient = httpClientFactory.CreateClient();
var response = await httpClient.GetAsync(feed.Url, cancellationToken);
response.EnsureSuccessStatusCode();
await using var stream = await response.Content.ReadAsStreamAsync(cancellationToken);
using var reader = XmlReader.Create(stream);
var syndicationFeed = SyndicationFeed.Load(reader);
if (syndicationFeed == null)
{
logger.LogWarning("Could not parse syndication feed for {FeedUrl}", feed.Url);
return;
}
foreach (var item in syndicationFeed.Items)
{
var itemUrl = item.Links.FirstOrDefault()?.Uri?.ToString();
if (string.IsNullOrEmpty(itemUrl))
continue;
var articleExists = await database.Set<WebArticle>()
.AnyAsync(a => a.FeedId == feed.Id && a.Url == itemUrl, cancellationToken);
if (articleExists)
continue;
// Prefer the full content; fall back to the summary. Either may be null.
var content = (item.Content as TextSyndicationContent)?.Text ?? item.Summary?.Text;
LinkEmbed preview;
if (feed.Config.ScrapPage)
{
var scrapedArticle = await webReaderService.ScrapeArticleAsync(itemUrl, cancellationToken);
preview = scrapedArticle.LinkEmbed;
if (scrapedArticle.Content is not null)
content = scrapedArticle.Content;
}
else
{
preview = await webReaderService.GetLinkPreviewAsync(itemUrl, cancellationToken);
}
var newArticle = new WebArticle
{
FeedId = feed.Id,
Title = item.Title.Text,
Url = itemUrl,
Author = item.Authors.FirstOrDefault()?.Name,
Content = content,
// Prefer the explicit publish date; fall back to last-updated when absent.
PublishedAt = item.PublishDate != DateTimeOffset.MinValue
? item.PublishDate.UtcDateTime
: item.LastUpdatedTime.UtcDateTime,
Preview = preview,
};
database.WebArticles.Add(newArticle);
}
await database.SaveChangesAsync(cancellationToken);
}
}


@@ -0,0 +1,110 @@
using DysonNetwork.Sphere.Permission;
using Microsoft.AspNetCore.Authorization;
using Microsoft.AspNetCore.Mvc;
using Microsoft.AspNetCore.RateLimiting;
namespace DysonNetwork.Sphere.WebReader;
/// <summary>
/// Controller for web scraping and link preview services
/// </summary>
[ApiController]
[Route("/api/scrap")]
[EnableRateLimiting("fixed")]
public class WebReaderController(WebReaderService reader, ILogger<WebReaderController> logger)
: ControllerBase
{
/// <summary>
/// Retrieves a preview for the provided URL
/// </summary>
/// <param name="url">URL-encoded link to generate preview for</param>
/// <returns>Link preview data including title, description, and image</returns>
[HttpGet("link")]
public async Task<ActionResult<LinkEmbed>> ScrapLink([FromQuery] string url)
{
if (string.IsNullOrEmpty(url))
{
return BadRequest(new { error = "URL parameter is required" });
}
try
{
// Ensure URL is properly decoded
var decodedUrl = UrlDecoder.Decode(url);
// Validate URL format
if (!Uri.TryCreate(decodedUrl, UriKind.Absolute, out _))
{
return BadRequest(new { error = "Invalid URL format" });
}
var linkEmbed = await reader.GetLinkPreviewAsync(decodedUrl);
return Ok(linkEmbed);
}
catch (WebReaderException ex)
{
logger.LogWarning(ex, "Error scraping link: {Url}", url);
return BadRequest(new { error = ex.Message });
}
catch (Exception ex)
{
logger.LogError(ex, "Unexpected error scraping link: {Url}", url);
return StatusCode(StatusCodes.Status500InternalServerError,
new { error = "An unexpected error occurred while processing the link" });
}
}
/// <summary>
/// Force invalidates the cache for a specific URL
/// </summary>
[HttpDelete("link/cache")]
[Authorize]
[RequiredPermission("maintenance", "cache.scrap")]
public async Task<IActionResult> InvalidateCache([FromQuery] string url)
{
if (string.IsNullOrEmpty(url))
{
return BadRequest(new { error = "URL parameter is required" });
}
await reader.InvalidateCacheForUrlAsync(url);
return Ok(new { message = "Cache invalidated for URL" });
}
/// <summary>
/// Force invalidates all cached link previews
/// </summary>
[HttpDelete("cache/all")]
[Authorize]
[RequiredPermission("maintenance", "cache.scrap")]
public async Task<IActionResult> InvalidateAllCache()
{
await reader.InvalidateAllCachedPreviewsAsync();
return Ok(new { message = "All link preview caches invalidated" });
}
}
/// <summary>
/// Helper class for URL decoding
/// </summary>
public static class UrlDecoder
{
public static string Decode(string url)
{
// First check if URL is already decoded
if (!url.Contains('%') && !url.Contains('+'))
{
return url;
}
try
{
return System.Net.WebUtility.UrlDecode(url);
}
catch
{
// If decoding fails, return the original string
return url;
}
}
}


@@ -0,0 +1,17 @@
using System;
namespace DysonNetwork.Sphere.WebReader;
/// <summary>
/// Exception thrown when an error occurs during web reading operations
/// </summary>
public class WebReaderException : Exception
{
public WebReaderException(string message) : base(message)
{
}
public WebReaderException(string message, Exception innerException) : base(message, innerException)
{
}
}


@@ -0,0 +1,367 @@
using System.Globalization;
using AngleSharp;
using AngleSharp.Dom;
using DysonNetwork.Shared.Cache;
using HtmlAgilityPack;
namespace DysonNetwork.Sphere.WebReader;
/// <summary>
/// This service aims to provide scraping capabilities for the Solar Network,
/// such as news feeds, external articles, and link previews.
/// </summary>
public class WebReaderService(
IHttpClientFactory httpClientFactory,
ILogger<WebReaderService> logger,
ICacheService cache
)
{
private const string LinkPreviewCachePrefix = "scrap:preview:";
private const string LinkPreviewCacheGroup = "scrap:preview";
public async Task<ScrapedArticle> ScrapeArticleAsync(string url, CancellationToken cancellationToken = default)
{
var linkEmbed = await GetLinkPreviewAsync(url, cancellationToken);
var content = await GetArticleContentAsync(url, cancellationToken);
return new ScrapedArticle
{
LinkEmbed = linkEmbed,
Content = content
};
}
private async Task<string?> GetArticleContentAsync(string url, CancellationToken cancellationToken)
{
var httpClient = httpClientFactory.CreateClient("WebReader");
var response = await httpClient.GetAsync(url, cancellationToken);
if (!response.IsSuccessStatusCode)
{
logger.LogWarning("Failed to scrape article content for URL: {Url}", url);
return null;
}
var html = await response.Content.ReadAsStringAsync(cancellationToken);
var doc = new HtmlDocument();
doc.LoadHtml(html);
var articleNode = doc.DocumentNode.SelectSingleNode("//article");
return articleNode?.InnerHtml;
}
/// <summary>
/// Generate a link preview embed from a URL
/// </summary>
/// <param name="url">The URL to generate the preview for</param>
/// <param name="cancellationToken">Cancellation token</param>
/// <param name="bypassCache">If true, bypass cache and fetch fresh data</param>
/// <param name="cacheExpiry">Custom cache expiration time</param>
/// <returns>A LinkEmbed object containing the preview data</returns>
public async Task<LinkEmbed> GetLinkPreviewAsync(
string url,
CancellationToken cancellationToken = default,
TimeSpan? cacheExpiry = null,
bool bypassCache = false
)
{
// Ensure URL is valid
if (!Uri.TryCreate(url, UriKind.Absolute, out var uri))
{
throw new ArgumentException(@"Invalid URL format", nameof(url));
}
// Try to get from cache if not bypassing
if (!bypassCache)
{
var cachedPreview = await GetCachedLinkPreview(url);
if (cachedPreview is not null)
return cachedPreview;
}
// Cache miss or bypass, fetch fresh data
logger.LogDebug("Fetching fresh link preview for URL: {Url}", url);
var httpClient = httpClientFactory.CreateClient("WebReader");
httpClient.MaxResponseContentBufferSize =
10 * 1024 * 1024; // 10 MB cap, so large directly accessible files aren't scraped
httpClient.Timeout = TimeSpan.FromSeconds(3);
// Use Facebook's crawler User-Agent so sites serve their OpenGraph markup.
httpClient.DefaultRequestHeaders.Add("User-Agent", "facebookexternalhit/1.1");
try
{
var response = await httpClient.GetAsync(url, cancellationToken);
response.EnsureSuccessStatusCode();
var contentType = response.Content.Headers.ContentType?.MediaType;
if (contentType == null || !contentType.StartsWith("text/html"))
{
logger.LogWarning("URL is not an HTML page: {Url}, ContentType: {ContentType}", url, contentType);
var nonHtmlEmbed = new LinkEmbed
{
Url = url,
Title = uri.Host,
ContentType = contentType
};
// Cache non-HTML responses too
await CacheLinkPreview(nonHtmlEmbed, url, cacheExpiry);
return nonHtmlEmbed;
}
var html = await response.Content.ReadAsStringAsync(cancellationToken);
var linkEmbed = await ExtractLinkData(url, html, uri);
// Cache the result
await CacheLinkPreview(linkEmbed, url, cacheExpiry);
return linkEmbed;
}
catch (HttpRequestException ex)
{
logger.LogError(ex, "Failed to fetch URL: {Url}", url);
throw new WebReaderException($"Failed to fetch URL: {url}", ex);
}
}
private async Task<LinkEmbed> ExtractLinkData(string url, string html, Uri uri)
{
var embed = new LinkEmbed
{
Url = url
};
// Configure AngleSharp context
var config = Configuration.Default;
var context = BrowsingContext.New(config);
var document = await context.OpenAsync(req => req.Content(html));
// Extract OpenGraph tags
var ogTitle = GetMetaTagContent(document, "og:title");
var ogDescription = GetMetaTagContent(document, "og:description");
var ogImage = GetMetaTagContent(document, "og:image");
var ogSiteName = GetMetaTagContent(document, "og:site_name");
var ogType = GetMetaTagContent(document, "og:type");
// Extract Twitter card tags as fallback
var twitterTitle = GetMetaTagContent(document, "twitter:title");
var twitterDescription = GetMetaTagContent(document, "twitter:description");
var twitterImage = GetMetaTagContent(document, "twitter:image");
// Extract standard meta tags as final fallback
var metaTitle = GetMetaTagContent(document, "title") ??
GetMetaContent(document, "title");
var metaDescription = GetMetaTagContent(document, "description");
// Extract page title
var pageTitle = document.Title?.Trim();
// Extract publish date
var publishedTime = GetMetaTagContent(document, "article:published_time") ??
GetMetaTagContent(document, "datePublished") ??
GetMetaTagContent(document, "pubdate");
// Extract author
var author = GetMetaTagContent(document, "author") ??
GetMetaTagContent(document, "article:author");
// Extract favicon
var faviconUrl = GetFaviconUrl(document, uri);
// Populate the embed with the data, prioritizing OpenGraph
embed.Title = ogTitle ?? twitterTitle ?? metaTitle ?? pageTitle ?? uri.Host;
embed.Description = ogDescription ?? twitterDescription ?? metaDescription;
embed.ImageUrl = ResolveRelativeUrl(ogImage ?? twitterImage, uri);
embed.SiteName = ogSiteName ?? uri.Host;
embed.ContentType = ogType;
embed.FaviconUrl = faviconUrl;
embed.Author = author;
// Parse and set published date
if (!string.IsNullOrEmpty(publishedTime) &&
DateTime.TryParse(publishedTime, CultureInfo.InvariantCulture, DateTimeStyles.AdjustToUniversal,
out DateTime parsedDate))
{
embed.PublishedDate = parsedDate;
}
return embed;
}
private static string? GetMetaTagContent(IDocument doc, string property)
{
// Check for OpenGraph/Twitter style meta tags
var node = doc.QuerySelector($"meta[property='{property}'][content]")
?? doc.QuerySelector($"meta[name='{property}'][content]");
return node?.GetAttribute("content")?.Trim();
}
private static string? GetMetaContent(IDocument doc, string name)
{
var node = doc.QuerySelector($"meta[name='{name}'][content]");
return node?.GetAttribute("content")?.Trim();
}
private static string? GetFaviconUrl(IDocument doc, Uri baseUri)
{
// Look for apple-touch-icon first as it's typically higher quality
var appleIconNode = doc.QuerySelector("link[rel='apple-touch-icon'][href]");
if (appleIconNode != null)
{
return ResolveRelativeUrl(appleIconNode.GetAttribute("href"), baseUri);
}
// Then check for standard favicon
var faviconNode = doc.QuerySelector("link[rel='icon'][href]") ??
doc.QuerySelector("link[rel='shortcut icon'][href]");
return faviconNode != null
? ResolveRelativeUrl(faviconNode.GetAttribute("href"), baseUri)
: new Uri(baseUri, "/favicon.ico").ToString();
}
private static string? ResolveRelativeUrl(string? url, Uri baseUri)
{
if (string.IsNullOrEmpty(url))
{
return null;
}
if (Uri.TryCreate(url, UriKind.Absolute, out _))
{
return url; // Already absolute
}
return Uri.TryCreate(baseUri, url, out var absoluteUri) ? absoluteUri.ToString() : null;
}
/// <summary>
/// Generate a hash-based cache key for a URL
/// </summary>
private string GenerateUrlCacheKey(string url)
{
// Normalize the URL first
var normalizedUrl = NormalizeUrl(url);
// Create SHA256 hash of the normalized URL
using var sha256 = System.Security.Cryptography.SHA256.Create();
var urlBytes = System.Text.Encoding.UTF8.GetBytes(normalizedUrl);
var hashBytes = sha256.ComputeHash(urlBytes);
// Convert to hex string
var hashString = BitConverter.ToString(hashBytes).Replace("-", "").ToLowerInvariant();
// Return prefixed key
return $"{LinkPreviewCachePrefix}{hashString}";
}
/// <summary>
/// Normalize URL by trimming trailing slashes but preserving query parameters
/// </summary>
private string NormalizeUrl(string url)
{
if (string.IsNullOrEmpty(url))
return string.Empty;
// First ensure we have a valid URI
if (!Uri.TryCreate(url, UriKind.Absolute, out var uri))
return url.TrimEnd('/');
// Rebuild the URL without trailing slashes but with query parameters
var scheme = uri.Scheme;
var host = uri.Host;
var port = uri.IsDefaultPort ? string.Empty : $":{uri.Port}";
var path = uri.AbsolutePath.TrimEnd('/');
var query = uri.Query;
return $"{scheme}://{host}{port}{path}{query}".ToLowerInvariant();
}
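// Illustrative examples of the normalization above (derived by reading the
// code, not from test output):
//   NormalizeUrl("HTTPS://Example.com:443/Path/?q=1") -> "https://example.com/path?q=1"
//   NormalizeUrl("http://example.com:8080/a/")        -> "http://example.com:8080/a"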
/// <summary>
/// Cache a link preview
/// </summary>
private async Task CacheLinkPreview(LinkEmbed? linkEmbed, string url, TimeSpan? expiry = null)
{
if (linkEmbed == null || string.IsNullOrEmpty(url))
return;
try
{
var cacheKey = GenerateUrlCacheKey(url);
var expiryTime = expiry ?? TimeSpan.FromHours(24);
await cache.SetWithGroupsAsync(
cacheKey,
linkEmbed,
[LinkPreviewCacheGroup],
expiryTime);
logger.LogDebug("Cached link preview for URL: {Url} with key: {CacheKey}", url, cacheKey);
}
catch (Exception ex)
{
// Log but don't throw - caching failures shouldn't break the main functionality
logger.LogWarning(ex, "Failed to cache link preview for URL: {Url}", url);
}
}
/// <summary>
/// Try to get a cached link preview
/// </summary>
private async Task<LinkEmbed?> GetCachedLinkPreview(string url)
{
if (string.IsNullOrEmpty(url))
return null;
try
{
var cacheKey = GenerateUrlCacheKey(url);
var cachedPreview = await cache.GetAsync<LinkEmbed>(cacheKey);
if (cachedPreview is not null)
logger.LogDebug("Retrieved cached link preview for URL: {Url}", url);
return cachedPreview;
}
catch (Exception ex)
{
logger.LogWarning(ex, "Failed to retrieve cached link preview for URL: {Url}", url);
return null;
}
}
/// <summary>
/// Invalidate cache for a specific URL
/// </summary>
public async Task InvalidateCacheForUrlAsync(string url)
{
if (string.IsNullOrEmpty(url))
return;
try
{
var cacheKey = GenerateUrlCacheKey(url);
await cache.RemoveAsync(cacheKey);
logger.LogDebug("Invalidated cache for URL: {Url} with key: {CacheKey}", url, cacheKey);
}
catch (Exception ex)
{
logger.LogWarning(ex, "Failed to invalidate cache for URL: {Url}", url);
}
}
/// <summary>
/// Invalidate all cached link previews
/// </summary>
public async Task InvalidateAllCachedPreviewsAsync()
{
try
{
await cache.RemoveGroupAsync(LinkPreviewCacheGroup);
logger.LogInformation("Invalidated all cached link previews");
}
catch (Exception ex)
{
logger.LogWarning(ex, "Failed to invalidate all cached link previews");
}
}
}