♻️ I have no idea what I'm doing. Might be mixing stuff

2025-07-14 19:55:28 +08:00
parent ef9175d27d
commit cbfdb4aa60
232 changed files with 990 additions and 115807 deletions


@@ -0,0 +1,44 @@
using System.Reflection;
using System.Text.Json.Serialization;
namespace DysonNetwork.Sphere.WebReader;
/// <summary>
/// An embeddable can be used in the embeds field of a post's or message's meta
/// to render a richer type of content.
///
/// A simple example of using a link preview embed:
/// <code>
/// {
///     // ... post content
///     "meta": {
///         "embeds": [
///             {
///                 "type": "link",
///                 "title": "...",
///                 // ...
///             }
///         ]
///     }
/// }
/// </code>
/// </summary>
public abstract class EmbeddableBase
{
public abstract string Type { get; }
public Dictionary<string, object> ToDictionary()
{
var dict = new Dictionary<string, object>();
foreach (var prop in GetType().GetProperties())
{
if (prop.GetCustomAttribute<JsonIgnoreAttribute>() is not null)
continue;
var value = prop.GetValue(this);
if (value is null) continue;
dict[prop.Name] = value;
}
return dict;
}
}
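
For reference, a minimal sketch of the reflection behavior (the StickerEmbed type below is hypothetical, for illustration only): ToDictionary copies every readable public property, skipping [JsonIgnore] members and null values.

public class StickerEmbed : EmbeddableBase
{
    public override string Type => "sticker";
    public required string StickerId { get; set; }
    [JsonIgnore] public string? InternalNote { get; set; } // never exported
}

// new StickerEmbed { StickerId = "abc" }.ToDictionary()
// => { "Type": "sticker", "StickerId": "abc" }; InternalNote is skipped,
// and so is any property whose value is null.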


@@ -0,0 +1,55 @@
namespace DysonNetwork.Sphere.WebReader;
/// <summary>
/// The link embed is one of the embeddable implementations.
/// It can be used in the embeds field of a post's or message's meta.
/// </summary>
public class LinkEmbed : EmbeddableBase
{
public override string Type => "link";
/// <summary>
/// The original URL that was processed
/// </summary>
public required string Url { get; set; }
/// <summary>
/// Title of the linked content (from OpenGraph og:title, meta title, or page title)
/// </summary>
public string? Title { get; set; }
/// <summary>
/// Description of the linked content (from OpenGraph og:description or meta description)
/// </summary>
public string? Description { get; set; }
/// <summary>
/// URL to the thumbnail image (from OpenGraph og:image or other meta tags)
/// </summary>
public string? ImageUrl { get; set; }
/// <summary>
/// The favicon URL of the site
/// </summary>
public string? FaviconUrl { get; set; }
/// <summary>
/// The site name (from OpenGraph og:site_name)
/// </summary>
public string? SiteName { get; set; }
/// <summary>
/// Type of the content (from OpenGraph og:type)
/// </summary>
public string? ContentType { get; set; }
/// <summary>
/// Author of the content if available
/// </summary>
public string? Author { get; set; }
/// <summary>
/// Published date of the content if available
/// </summary>
public DateTime? PublishedDate { get; set; }
}
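
A hedged usage sketch (all field values are placeholders) of attaching a preview to a meta dictionary, following the embeds shape documented on EmbeddableBase; the surrounding post model is assumed, not part of this commit:

var embed = new LinkEmbed
{
    Url = "https://example.com/article",
    Title = "Example article",
    SiteName = "example.com",
};

// Hypothetical: a post's meta is assumed to be a Dictionary<string, object>.
var meta = new Dictionary<string, object>
{
    ["embeds"] = new List<Dictionary<string, object>> { embed.ToDictionary() }
};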


@@ -0,0 +1,7 @@
namespace DysonNetwork.Sphere.WebReader;
public class ScrapedArticle
{
public LinkEmbed LinkEmbed { get; set; } = null!;
public string? Content { get; set; }
}


@@ -0,0 +1,46 @@
using System.ComponentModel.DataAnnotations;
using System.ComponentModel.DataAnnotations.Schema;
using System.Text.Json.Serialization;
namespace DysonNetwork.Sphere.WebReader;
public class WebArticle : ModelBase
{
public Guid Id { get; set; } = Guid.NewGuid();
[MaxLength(4096)] public string Title { get; set; } = null!;
[MaxLength(8192)] public string Url { get; set; } = null!;
[MaxLength(4096)] public string? Author { get; set; }
[Column(TypeName = "jsonb")] public Dictionary<string, object>? Meta { get; set; }
[Column(TypeName = "jsonb")] public LinkEmbed? Preview { get; set; }
// ReSharper disable once EntityFramework.ModelValidation.UnlimitedStringLength
public string? Content { get; set; }
public DateTime? PublishedAt { get; set; }
public Guid FeedId { get; set; }
public WebFeed Feed { get; set; } = null!;
}
public class WebFeedConfig
{
/// <summary>
/// When true, each article's page is fetched and scraped for full content
/// and a richer preview, in addition to the feed entry itself.
/// </summary>
public bool ScrapPage { get; set; }
}
public class WebFeed : ModelBase
{
public Guid Id { get; set; } = Guid.NewGuid();
[MaxLength(8192)] public string Url { get; set; } = null!;
[MaxLength(4096)] public string Title { get; set; } = null!;
[MaxLength(8192)] public string? Description { get; set; }
[Column(TypeName = "jsonb")] public LinkEmbed? Preview { get; set; }
[Column(TypeName = "jsonb")] public WebFeedConfig Config { get; set; } = new();
public Guid PublisherId { get; set; }
public Publisher.Publisher Publisher { get; set; } = null!;
[JsonIgnore] public ICollection<WebArticle> Articles { get; set; } = new List<WebArticle>();
}
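
The jsonb columns above store POCOs (LinkEmbed, WebFeedConfig) and a dictionary. A hedged wiring sketch, assuming Npgsql 8+ with EF Core, where dynamic JSON mapping must be enabled on the data source; this registration is not part of the commit:

var dataSourceBuilder = new NpgsqlDataSourceBuilder(connectionString);
dataSourceBuilder.EnableDynamicJson(); // required for Dictionary/POCO jsonb mapping
var dataSource = dataSourceBuilder.Build();
builder.Services.AddDbContext<AppDatabase>(opt => opt.UseNpgsql(dataSource));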


@@ -0,0 +1,82 @@
using Microsoft.AspNetCore.Mvc;
using Microsoft.EntityFrameworkCore;
namespace DysonNetwork.Sphere.WebReader;
[ApiController]
[Route("/api/feeds/articles")]
public class WebArticleController(AppDatabase db) : ControllerBase
{
/// <summary>
/// Get a list of recent web articles
/// </summary>
/// <param name="limit">Maximum number of articles to return</param>
/// <param name="offset">Number of articles to skip</param>
/// <param name="feedId">Optional feed ID to filter by</param>
/// <param name="publisherId">Optional publisher ID to filter by</param>
/// <returns>List of web articles</returns>
[HttpGet]
public async Task<IActionResult> GetArticles(
[FromQuery] int limit = 20,
[FromQuery] int offset = 0,
[FromQuery] Guid? feedId = null,
[FromQuery] Guid? publisherId = null
)
{
var query = db.WebArticles
.OrderByDescending(a => a.PublishedAt)
.Include(a => a.Feed)
.AsQueryable();
if (feedId.HasValue)
query = query.Where(a => a.FeedId == feedId.Value);
if (publisherId.HasValue)
query = query.Where(a => a.Feed.PublisherId == publisherId.Value);
var totalCount = await query.CountAsync();
var articles = await query
.Skip(offset)
.Take(limit)
.ToListAsync();
Response.Headers["X-Total"] = totalCount.ToString();
return Ok(articles);
}
/// <summary>
/// Get a specific web article by ID
/// </summary>
/// <param name="id">The article ID</param>
/// <returns>The web article</returns>
[HttpGet("{id:guid}")]
[ProducesResponseType(404)]
public async Task<IActionResult> GetArticle(Guid id)
{
var article = await db.WebArticles
.Include(a => a.Feed)
.FirstOrDefaultAsync(a => a.Id == id);
if (article == null)
return NotFound();
return Ok(article);
}
/// <summary>
/// Get random web articles
/// </summary>
/// <param name="limit">Maximum number of articles to return</param>
/// <returns>List of random web articles</returns>
[HttpGet("random")]
public async Task<IActionResult> GetRandomArticles([FromQuery] int limit = 5)
{
var articles = await db.WebArticles
.OrderBy(_ => EF.Functions.Random())
.Include(a => a.Feed)
.Take(limit)
.ToListAsync();
return Ok(articles);
}
}
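
A hedged client-side sketch (not part of this commit) of consuming the paginated listing above; the base address is a placeholder:

using System.Net.Http.Json;

using var client = new HttpClient { BaseAddress = new Uri("https://example.host") };
var resp = await client.GetAsync("/api/feeds/articles?limit=20&offset=0");
resp.EnsureSuccessStatusCode();
var total = int.Parse(resp.Headers.GetValues("X-Total").First()); // total matching rows
var page = await resp.Content.ReadFromJsonAsync<List<WebArticle>>(); // current page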


@@ -0,0 +1,129 @@
using System.ComponentModel.DataAnnotations;
using DysonNetwork.Shared.Proto;
using DysonNetwork.Sphere.Publisher;
using Microsoft.AspNetCore.Authorization;
using Microsoft.AspNetCore.Mvc;
namespace DysonNetwork.Sphere.WebReader;
[Authorize]
[ApiController]
[Route("/api/publishers/{pubName}/feeds")]
public class WebFeedController(WebFeedService webFeed, PublisherService ps) : ControllerBase
{
public record WebFeedRequest(
[MaxLength(8192)] string? Url,
[MaxLength(4096)] string? Title,
[MaxLength(8192)] string? Description,
WebFeedConfig? Config
);
[HttpGet]
public async Task<IActionResult> ListFeeds([FromRoute] string pubName)
{
var publisher = await ps.GetPublisherByName(pubName);
if (publisher is null) return NotFound();
var feeds = await webFeed.GetFeedsByPublisherAsync(publisher.Id);
return Ok(feeds);
}
[HttpGet("{id:guid}")]
public async Task<IActionResult> GetFeed([FromRoute] string pubName, Guid id)
{
var publisher = await ps.GetPublisherByName(pubName);
if (publisher is null) return NotFound();
var feed = await webFeed.GetFeedAsync(id, publisherId: publisher.Id);
if (feed == null)
return NotFound();
return Ok(feed);
}
[HttpPost]
[Authorize]
public async Task<IActionResult> CreateWebFeed([FromRoute] string pubName, [FromBody] WebFeedRequest request)
{
if (HttpContext.Items["CurrentUser"] is not Account currentUser) return Unauthorized();
if (string.IsNullOrWhiteSpace(request.Url) || string.IsNullOrWhiteSpace(request.Title))
return BadRequest("Url and title are required");
var publisher = await ps.GetPublisherByName(pubName);
if (publisher is null) return NotFound();
var accountId = Guid.Parse(currentUser.Id);
if (!await ps.IsMemberWithRole(publisher.Id, accountId, PublisherMemberRole.Editor))
return StatusCode(403, "You must be an editor of the publisher to create a web feed");
var feed = await webFeed.CreateWebFeedAsync(publisher, request);
return Ok(feed);
}
[HttpPatch("{id:guid}")]
[Authorize]
public async Task<IActionResult> UpdateFeed([FromRoute] string pubName, Guid id, [FromBody] WebFeedRequest request)
{
if (HttpContext.Items["CurrentUser"] is not Account currentUser) return Unauthorized();
var publisher = await ps.GetPublisherByName(pubName);
if (publisher is null) return NotFound();
var accountId = Guid.Parse(currentUser.Id);
if (!await ps.IsMemberWithRole(publisher.Id, accountId, PublisherMemberRole.Editor))
return StatusCode(403, "You must be an editor of the publisher to update a web feed");
var feed = await webFeed.GetFeedAsync(id, publisherId: publisher.Id);
if (feed == null)
return NotFound();
feed = await webFeed.UpdateFeedAsync(feed, request);
return Ok(feed);
}
[HttpDelete("{id:guid}")]
[Authorize]
public async Task<IActionResult> DeleteFeed([FromRoute] string pubName, Guid id)
{
if (HttpContext.Items["CurrentUser"] is not Account currentUser) return Unauthorized();
var publisher = await ps.GetPublisherByName(pubName);
if (publisher is null) return NotFound();
var accountId = Guid.Parse(currentUser.Id);
if (!await ps.IsMemberWithRole(publisher.Id, accountId, PublisherMemberRole.Editor))
return StatusCode(403, "You must be an editor of the publisher to delete a web feed");
var feed = await webFeed.GetFeedAsync(id, publisherId: publisher.Id);
if (feed == null)
return NotFound();
var result = await webFeed.DeleteFeedAsync(id);
if (!result)
return NotFound();
return NoContent();
}
[HttpPost("{id:guid}/scrap")]
[Authorize]
public async Task<ActionResult> Scrap([FromRoute] string pubName, Guid id)
{
if (HttpContext.Items["CurrentUser"] is not Account currentUser) return Unauthorized();
var publisher = await ps.GetPublisherByName(pubName);
if (publisher is null) return NotFound();
var accountId = Guid.Parse(currentUser.Id);
if (!await ps.IsMemberWithRole(publisher.Id, accountId, PublisherMemberRole.Editor))
return StatusCode(403, "You must be an editor of the publisher to scrape a web feed");
var feed = await webFeed.GetFeedAsync(id, publisherId: publisher.Id);
if (feed == null)
{
return NotFound();
}
await webFeed.ScrapeFeedAsync(feed);
return Ok();
}
}


@@ -0,0 +1,35 @@
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Logging;
using Quartz;
namespace DysonNetwork.Sphere.WebReader;
[DisallowConcurrentExecution]
public class WebFeedScraperJob(
AppDatabase database,
WebFeedService webFeedService,
ILogger<WebFeedScraperJob> logger
)
: IJob
{
public async Task Execute(IJobExecutionContext context)
{
logger.LogInformation("Starting web feed scraper job.");
var feeds = await database.Set<WebFeed>().ToListAsync(context.CancellationToken);
foreach (var feed in feeds)
{
try
{
await webFeedService.ScrapeFeedAsync(feed, context.CancellationToken);
}
catch (Exception ex)
{
logger.LogError(ex, "Failed to scrape web feed {FeedId}", feed.Id);
}
}
logger.LogInformation("Web feed scraper job finished.");
}
}
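
The commit doesn't show how this job is scheduled; a hedged registration sketch using Quartz's DI extensions (the cron expression is an assumption, hourly here for illustration):

builder.Services.AddQuartz(q =>
{
    var jobKey = new JobKey(nameof(WebFeedScraperJob));
    q.AddJob<WebFeedScraperJob>(opts => opts.WithIdentity(jobKey));
    q.AddTrigger(opts => opts
        .ForJob(jobKey)
        .WithIdentity($"{nameof(WebFeedScraperJob)}-trigger")
        .WithCronSchedule("0 0 * * * ?")); // top of every hour — illustrative only
});
builder.Services.AddQuartzHostedService(opts => opts.WaitForJobsToComplete = true);

[DisallowConcurrentExecution] above ensures overlapping runs are queued rather than run in parallel if a scrape outlasts the trigger interval.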


@@ -0,0 +1,135 @@
using System.ServiceModel.Syndication;
using System.Xml;
using Microsoft.EntityFrameworkCore;
namespace DysonNetwork.Sphere.WebReader;
public class WebFeedService(
AppDatabase database,
IHttpClientFactory httpClientFactory,
ILogger<WebFeedService> logger,
WebReaderService webReaderService
)
{
public async Task<WebFeed> CreateWebFeedAsync(Publisher.Publisher publisher,
WebFeedController.WebFeedRequest request)
{
var feed = new WebFeed
{
Url = request.Url!,
Title = request.Title!,
Description = request.Description,
Config = request.Config ?? new WebFeedConfig(),
PublisherId = publisher.Id,
};
database.Set<WebFeed>().Add(feed);
await database.SaveChangesAsync();
return feed;
}
public async Task<WebFeed?> GetFeedAsync(Guid id, Guid? publisherId = null)
{
var query = database.WebFeeds.Where(a => a.Id == id).AsQueryable();
if (publisherId.HasValue)
query = query.Where(a => a.PublisherId == publisherId.Value);
return await query.FirstOrDefaultAsync();
}
public async Task<List<WebFeed>> GetFeedsByPublisherAsync(Guid publisherId)
{
return await database.WebFeeds.Where(a => a.PublisherId == publisherId).ToListAsync();
}
public async Task<WebFeed> UpdateFeedAsync(WebFeed feed, WebFeedController.WebFeedRequest request)
{
if (request.Url is not null)
feed.Url = request.Url;
if (request.Title is not null)
feed.Title = request.Title;
if (request.Description is not null)
feed.Description = request.Description;
if (request.Config is not null)
feed.Config = request.Config;
database.Update(feed);
await database.SaveChangesAsync();
return feed;
}
public async Task<bool> DeleteFeedAsync(Guid id)
{
var feed = await database.WebFeeds.FindAsync(id);
if (feed == null)
{
return false;
}
database.WebFeeds.Remove(feed);
await database.SaveChangesAsync();
return true;
}
public async Task ScrapeFeedAsync(WebFeed feed, CancellationToken cancellationToken = default)
{
var httpClient = httpClientFactory.CreateClient();
var response = await httpClient.GetAsync(feed.Url, cancellationToken);
response.EnsureSuccessStatusCode();
await using var stream = await response.Content.ReadAsStreamAsync(cancellationToken);
using var reader = XmlReader.Create(stream);
var syndicationFeed = SyndicationFeed.Load(reader);
if (syndicationFeed == null)
{
logger.LogWarning("Could not parse syndication feed for {FeedUrl}", feed.Url);
return;
}
foreach (var item in syndicationFeed.Items)
{
var itemUrl = item.Links.FirstOrDefault()?.Uri?.ToString();
if (string.IsNullOrEmpty(itemUrl))
continue;
var articleExists = await database.Set<WebArticle>()
.AnyAsync(a => a.FeedId == feed.Id && a.Url == itemUrl, cancellationToken);
if (articleExists)
continue;
// Prefer the full content; fall back to the summary. Either may be null.
var content = (item.Content as TextSyndicationContent)?.Text ?? item.Summary?.Text;
LinkEmbed preview;
if (feed.Config.ScrapPage)
{
var scrapedArticle = await webReaderService.ScrapeArticleAsync(itemUrl, cancellationToken);
preview = scrapedArticle.LinkEmbed;
if (scrapedArticle.Content is not null)
content = scrapedArticle.Content;
}
else
{
preview = await webReaderService.GetLinkPreviewAsync(itemUrl, cancellationToken);
}
var newArticle = new WebArticle
{
FeedId = feed.Id,
Title = item.Title.Text,
Url = itemUrl,
Author = item.Authors.FirstOrDefault()?.Name,
Content = content,
// Prefer the explicit publish date; fall back to last-updated when absent.
PublishedAt = item.PublishDate != DateTimeOffset.MinValue
? item.PublishDate.UtcDateTime
: item.LastUpdatedTime.UtcDateTime,
Preview = preview,
};
database.WebArticles.Add(newArticle);
}
await database.SaveChangesAsync(cancellationToken);
}
}


@@ -0,0 +1,110 @@
using DysonNetwork.Sphere.Permission;
using Microsoft.AspNetCore.Authorization;
using Microsoft.AspNetCore.Mvc;
using Microsoft.AspNetCore.RateLimiting;
namespace DysonNetwork.Sphere.WebReader;
/// <summary>
/// Controller for web scraping and link preview services
/// </summary>
[ApiController]
[Route("/api/scrap")]
[EnableRateLimiting("fixed")]
public class WebReaderController(WebReaderService reader, ILogger<WebReaderController> logger)
: ControllerBase
{
/// <summary>
/// Retrieves a preview for the provided URL
/// </summary>
/// <param name="url">URL-encoded link to generate preview for</param>
/// <returns>Link preview data including title, description, and image</returns>
[HttpGet("link")]
public async Task<ActionResult<LinkEmbed>> ScrapLink([FromQuery] string url)
{
if (string.IsNullOrEmpty(url))
{
return BadRequest(new { error = "URL parameter is required" });
}
try
{
// Ensure URL is properly decoded
var decodedUrl = UrlDecoder.Decode(url);
// Validate URL format
if (!Uri.TryCreate(decodedUrl, UriKind.Absolute, out _))
{
return BadRequest(new { error = "Invalid URL format" });
}
var linkEmbed = await reader.GetLinkPreviewAsync(decodedUrl);
return Ok(linkEmbed);
}
catch (WebReaderException ex)
{
logger.LogWarning(ex, "Error scraping link: {Url}", url);
return BadRequest(new { error = ex.Message });
}
catch (Exception ex)
{
logger.LogError(ex, "Unexpected error scraping link: {Url}", url);
return StatusCode(StatusCodes.Status500InternalServerError,
new { error = "An unexpected error occurred while processing the link" });
}
}
/// <summary>
/// Force invalidates the cache for a specific URL
/// </summary>
[HttpDelete("link/cache")]
[Authorize]
[RequiredPermission("maintenance", "cache.scrap")]
public async Task<IActionResult> InvalidateCache([FromQuery] string url)
{
if (string.IsNullOrEmpty(url))
{
return BadRequest(new { error = "URL parameter is required" });
}
await reader.InvalidateCacheForUrlAsync(url);
return Ok(new { message = "Cache invalidated for URL" });
}
/// <summary>
/// Force invalidates all cached link previews
/// </summary>
[HttpDelete("cache/all")]
[Authorize]
[RequiredPermission("maintenance", "cache.scrap")]
public async Task<IActionResult> InvalidateAllCache()
{
await reader.InvalidateAllCachedPreviewsAsync();
return Ok(new { message = "All link preview caches invalidated" });
}
}
/// <summary>
/// Helper class for URL decoding
/// </summary>
public static class UrlDecoder
{
public static string Decode(string url)
{
// First check if URL is already decoded
if (!url.Contains('%') && !url.Contains('+'))
{
return url;
}
try
{
return System.Net.WebUtility.UrlDecode(url);
}
catch
{
// If decoding fails, return the original string
return url;
}
}
}


@@ -0,0 +1,17 @@
using System;
namespace DysonNetwork.Sphere.WebReader;
/// <summary>
/// Exception thrown when an error occurs during web reading operations
/// </summary>
public class WebReaderException : Exception
{
public WebReaderException(string message) : base(message)
{
}
public WebReaderException(string message, Exception innerException) : base(message, innerException)
{
}
}


@@ -0,0 +1,367 @@
using System.Globalization;
using AngleSharp;
using AngleSharp.Dom;
using DysonNetwork.Shared.Cache;
using HtmlAgilityPack;
namespace DysonNetwork.Sphere.WebReader;
/// <summary>
/// This service aims to provide scraping capabilities for the Solar Network,
/// such as news feeds, external articles, and link previews.
/// </summary>
public class WebReaderService(
IHttpClientFactory httpClientFactory,
ILogger<WebReaderService> logger,
ICacheService cache
)
{
private const string LinkPreviewCachePrefix = "scrap:preview:";
private const string LinkPreviewCacheGroup = "scrap:preview";
public async Task<ScrapedArticle> ScrapeArticleAsync(string url, CancellationToken cancellationToken = default)
{
var linkEmbed = await GetLinkPreviewAsync(url, cancellationToken);
var content = await GetArticleContentAsync(url, cancellationToken);
return new ScrapedArticle
{
LinkEmbed = linkEmbed,
Content = content
};
}
private async Task<string?> GetArticleContentAsync(string url, CancellationToken cancellationToken)
{
var httpClient = httpClientFactory.CreateClient("WebReader");
var response = await httpClient.GetAsync(url, cancellationToken);
if (!response.IsSuccessStatusCode)
{
logger.LogWarning("Failed to scrape article content for URL: {Url}", url);
return null;
}
var html = await response.Content.ReadAsStringAsync(cancellationToken);
var doc = new HtmlDocument();
doc.LoadHtml(html);
var articleNode = doc.DocumentNode.SelectSingleNode("//article");
return articleNode?.InnerHtml;
}
/// <summary>
/// Generate a link preview embed from a URL
/// </summary>
/// <param name="url">The URL to generate the preview for</param>
/// <param name="cancellationToken">Cancellation token</param>
/// <param name="bypassCache">If true, bypass cache and fetch fresh data</param>
/// <param name="cacheExpiry">Custom cache expiration time</param>
/// <returns>A LinkEmbed object containing the preview data</returns>
public async Task<LinkEmbed> GetLinkPreviewAsync(
string url,
CancellationToken cancellationToken = default,
TimeSpan? cacheExpiry = null,
bool bypassCache = false
)
{
// Ensure URL is valid
if (!Uri.TryCreate(url, UriKind.Absolute, out var uri))
{
throw new ArgumentException(@"Invalid URL format", nameof(url));
}
// Try to get from cache if not bypassing
if (!bypassCache)
{
var cachedPreview = await GetCachedLinkPreview(url);
if (cachedPreview is not null)
return cachedPreview;
}
// Cache miss or bypass, fetch fresh data
logger.LogDebug("Fetching fresh link preview for URL: {Url}", url);
var httpClient = httpClientFactory.CreateClient("WebReader");
httpClient.MaxResponseContentBufferSize =
10 * 1024 * 1024; // 10 MB cap, so large directly accessible files aren't scraped
httpClient.Timeout = TimeSpan.FromSeconds(3);
// Use Facebook's crawler User-Agent so sites serve their OpenGraph markup.
httpClient.DefaultRequestHeaders.Add("User-Agent", "facebookexternalhit/1.1");
try
{
var response = await httpClient.GetAsync(url, cancellationToken);
response.EnsureSuccessStatusCode();
var contentType = response.Content.Headers.ContentType?.MediaType;
if (contentType == null || !contentType.StartsWith("text/html"))
{
logger.LogWarning("URL is not an HTML page: {Url}, ContentType: {ContentType}", url, contentType);
var nonHtmlEmbed = new LinkEmbed
{
Url = url,
Title = uri.Host,
ContentType = contentType
};
// Cache non-HTML responses too
await CacheLinkPreview(nonHtmlEmbed, url, cacheExpiry);
return nonHtmlEmbed;
}
var html = await response.Content.ReadAsStringAsync(cancellationToken);
var linkEmbed = await ExtractLinkData(url, html, uri);
// Cache the result
await CacheLinkPreview(linkEmbed, url, cacheExpiry);
return linkEmbed;
}
catch (HttpRequestException ex)
{
logger.LogError(ex, "Failed to fetch URL: {Url}", url);
throw new WebReaderException($"Failed to fetch URL: {url}", ex);
}
}
private async Task<LinkEmbed> ExtractLinkData(string url, string html, Uri uri)
{
var embed = new LinkEmbed
{
Url = url
};
// Configure AngleSharp context
var config = Configuration.Default;
var context = BrowsingContext.New(config);
var document = await context.OpenAsync(req => req.Content(html));
// Extract OpenGraph tags
var ogTitle = GetMetaTagContent(document, "og:title");
var ogDescription = GetMetaTagContent(document, "og:description");
var ogImage = GetMetaTagContent(document, "og:image");
var ogSiteName = GetMetaTagContent(document, "og:site_name");
var ogType = GetMetaTagContent(document, "og:type");
// Extract Twitter card tags as fallback
var twitterTitle = GetMetaTagContent(document, "twitter:title");
var twitterDescription = GetMetaTagContent(document, "twitter:description");
var twitterImage = GetMetaTagContent(document, "twitter:image");
// Extract standard meta tags as final fallback
var metaTitle = GetMetaTagContent(document, "title") ??
GetMetaContent(document, "title");
var metaDescription = GetMetaTagContent(document, "description");
// Extract page title
var pageTitle = document.Title?.Trim();
// Extract publish date
var publishedTime = GetMetaTagContent(document, "article:published_time") ??
GetMetaTagContent(document, "datePublished") ??
GetMetaTagContent(document, "pubdate");
// Extract author
var author = GetMetaTagContent(document, "author") ??
GetMetaTagContent(document, "article:author");
// Extract favicon
var faviconUrl = GetFaviconUrl(document, uri);
// Populate the embed with the data, prioritizing OpenGraph
embed.Title = ogTitle ?? twitterTitle ?? metaTitle ?? pageTitle ?? uri.Host;
embed.Description = ogDescription ?? twitterDescription ?? metaDescription;
embed.ImageUrl = ResolveRelativeUrl(ogImage ?? twitterImage, uri);
embed.SiteName = ogSiteName ?? uri.Host;
embed.ContentType = ogType;
embed.FaviconUrl = faviconUrl;
embed.Author = author;
// Parse and set published date
if (!string.IsNullOrEmpty(publishedTime) &&
DateTime.TryParse(publishedTime, CultureInfo.InvariantCulture, DateTimeStyles.AdjustToUniversal,
out DateTime parsedDate))
{
embed.PublishedDate = parsedDate;
}
return embed;
}
private static string? GetMetaTagContent(IDocument doc, string property)
{
// Check for OpenGraph/Twitter style meta tags
var node = doc.QuerySelector($"meta[property='{property}'][content]")
?? doc.QuerySelector($"meta[name='{property}'][content]");
return node?.GetAttribute("content")?.Trim();
}
private static string? GetMetaContent(IDocument doc, string name)
{
var node = doc.QuerySelector($"meta[name='{name}'][content]");
return node?.GetAttribute("content")?.Trim();
}
private static string? GetFaviconUrl(IDocument doc, Uri baseUri)
{
// Look for apple-touch-icon first as it's typically higher quality
var appleIconNode = doc.QuerySelector("link[rel='apple-touch-icon'][href]");
if (appleIconNode != null)
{
return ResolveRelativeUrl(appleIconNode.GetAttribute("href"), baseUri);
}
// Then check for standard favicon
var faviconNode = doc.QuerySelector("link[rel='icon'][href]") ??
doc.QuerySelector("link[rel='shortcut icon'][href]");
return faviconNode != null
? ResolveRelativeUrl(faviconNode.GetAttribute("href"), baseUri)
: new Uri(baseUri, "/favicon.ico").ToString();
}
private static string? ResolveRelativeUrl(string? url, Uri baseUri)
{
if (string.IsNullOrEmpty(url))
{
return null;
}
if (Uri.TryCreate(url, UriKind.Absolute, out _))
{
return url; // Already absolute
}
return Uri.TryCreate(baseUri, url, out var absoluteUri) ? absoluteUri.ToString() : null;
}
/// <summary>
/// Generate a hash-based cache key for a URL
/// </summary>
private string GenerateUrlCacheKey(string url)
{
// Normalize the URL first
var normalizedUrl = NormalizeUrl(url);
// Create SHA256 hash of the normalized URL
using var sha256 = System.Security.Cryptography.SHA256.Create();
var urlBytes = System.Text.Encoding.UTF8.GetBytes(normalizedUrl);
var hashBytes = sha256.ComputeHash(urlBytes);
// Convert to hex string
var hashString = BitConverter.ToString(hashBytes).Replace("-", "").ToLowerInvariant();
// Return prefixed key
return $"{LinkPreviewCachePrefix}{hashString}";
}
/// <summary>
/// Normalize URL by trimming trailing slashes but preserving query parameters
/// </summary>
private string NormalizeUrl(string url)
{
if (string.IsNullOrEmpty(url))
return string.Empty;
// First ensure we have a valid URI
if (!Uri.TryCreate(url, UriKind.Absolute, out var uri))
return url.TrimEnd('/');
// Rebuild the URL without trailing slashes but with query parameters
var scheme = uri.Scheme;
var host = uri.Host;
var port = uri.IsDefaultPort ? string.Empty : $":{uri.Port}";
var path = uri.AbsolutePath.TrimEnd('/');
var query = uri.Query;
return $"{scheme}://{host}{port}{path}{query}".ToLowerInvariant();
}
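// Illustrative examples of the normalization above (derived by reading the
// code, not from test output):
//   NormalizeUrl("HTTPS://Example.com:443/Path/?q=1") -> "https://example.com/path?q=1"
//   NormalizeUrl("http://example.com:8080/a/")        -> "http://example.com:8080/a"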
/// <summary>
/// Cache a link preview
/// </summary>
private async Task CacheLinkPreview(LinkEmbed? linkEmbed, string url, TimeSpan? expiry = null)
{
if (linkEmbed == null || string.IsNullOrEmpty(url))
return;
try
{
var cacheKey = GenerateUrlCacheKey(url);
var expiryTime = expiry ?? TimeSpan.FromHours(24);
await cache.SetWithGroupsAsync(
cacheKey,
linkEmbed,
[LinkPreviewCacheGroup],
expiryTime);
logger.LogDebug("Cached link preview for URL: {Url} with key: {CacheKey}", url, cacheKey);
}
catch (Exception ex)
{
// Log but don't throw - caching failures shouldn't break the main functionality
logger.LogWarning(ex, "Failed to cache link preview for URL: {Url}", url);
}
}
/// <summary>
/// Try to get a cached link preview
/// </summary>
private async Task<LinkEmbed?> GetCachedLinkPreview(string url)
{
if (string.IsNullOrEmpty(url))
return null;
try
{
var cacheKey = GenerateUrlCacheKey(url);
var cachedPreview = await cache.GetAsync<LinkEmbed>(cacheKey);
if (cachedPreview is not null)
logger.LogDebug("Retrieved cached link preview for URL: {Url}", url);
return cachedPreview;
}
catch (Exception ex)
{
logger.LogWarning(ex, "Failed to retrieve cached link preview for URL: {Url}", url);
return null;
}
}
/// <summary>
/// Invalidate cache for a specific URL
/// </summary>
public async Task InvalidateCacheForUrlAsync(string url)
{
if (string.IsNullOrEmpty(url))
return;
try
{
var cacheKey = GenerateUrlCacheKey(url);
await cache.RemoveAsync(cacheKey);
logger.LogDebug("Invalidated cache for URL: {Url} with key: {CacheKey}", url, cacheKey);
}
catch (Exception ex)
{
logger.LogWarning(ex, "Failed to invalidate cache for URL: {Url}", url);
}
}
/// <summary>
/// Invalidate all cached link previews
/// </summary>
public async Task InvalidateAllCachedPreviewsAsync()
{
try
{
await cache.RemoveGroupAsync(LinkPreviewCacheGroup);
logger.LogInformation("Invalidated all cached link previews");
}
catch (Exception ex)
{
logger.LogWarning(ex, "Failed to invalidate all cached link previews");
}
}
}