diff --git a/DysonNetwork.Sphere/Connection/WebReader/IEmbeddable.cs b/DysonNetwork.Sphere/Connection/WebReader/IEmbeddable.cs
new file mode 100644
index 0000000..958f05c
--- /dev/null
+++ b/DysonNetwork.Sphere/Connection/WebReader/IEmbeddable.cs
@@ -0,0 +1,26 @@
+namespace DysonNetwork.Sphere.Connection.WebReader;
+
+/// <summary>
+/// The embeddable can be used in the post or messages' meta's embeds fields
+/// to render a richer type of content.
+///
+/// A simple example of using a link preview embed:
+/// <code>
+/// {
+///     // ... post content
+///     "meta": {
+///         "embeds": [
+///             {
+///                 "type": "link",
+///                 "title": "...",
+///                 // ...
+///             }
+///         ]
+///     }
+/// }
+/// </code>
+/// </summary>
+public interface IEmbeddable
+{
+    public string Type { get; }
+}
\ No newline at end of file
diff --git a/DysonNetwork.Sphere/Connection/WebReader/LinkEmbed.cs b/DysonNetwork.Sphere/Connection/WebReader/LinkEmbed.cs
new file mode 100644
index 0000000..72e4849
--- /dev/null
+++ b/DysonNetwork.Sphere/Connection/WebReader/LinkEmbed.cs
@@ -0,0 +1,55 @@
+namespace DysonNetwork.Sphere.Connection.WebReader;
+
+/// <summary>
+/// The link embed is a part of the embeddable implementations
+/// It can be used in the post or messages' meta's embeds fields
+/// </summary>
+public class LinkEmbed : IEmbeddable
+{
+    public string Type => "link";
+
+    /// <summary>
+    /// The original URL that was processed
+    /// </summary>
+    public required string Url { get; set; }
+
+    /// <summary>
+    /// Title of the linked content (from OpenGraph og:title, meta title, or page title)
+    /// </summary>
+    public string? Title { get; set; }
+
+    /// <summary>
+    /// Description of the linked content (from OpenGraph og:description or meta description)
+    /// </summary>
+    public string? Description { get; set; }
+
+    /// <summary>
+    /// URL to the thumbnail image (from OpenGraph og:image or other meta tags)
+    /// </summary>
+    public string? ImageUrl { get; set; }
+
+    /// <summary>
+    /// The favicon URL of the site
+    /// </summary>
+    public string? FaviconUrl { get; set; }
+
+    /// <summary>
+    /// The site name (from OpenGraph og:site_name)
+    /// </summary>
+    public string? SiteName { get; set; }
+
+    /// <summary>
+    /// Type of the content (from OpenGraph og:type)
+    /// </summary>
+    public string? ContentType { get; set; }
+
+    /// <summary>
+    /// Author of the content if available
+    /// </summary>
+    public string? Author { get; set; }
+
+    /// <summary>
+    /// Published date of the content if available
+    /// </summary>
+    public DateTime? PublishedDate { get; set; }
+}
\ No newline at end of file
diff --git a/DysonNetwork.Sphere/Connection/WebReader/WebReaderController.cs b/DysonNetwork.Sphere/Connection/WebReader/WebReaderController.cs
new file mode 100644
index 0000000..c32bbb4
--- /dev/null
+++ b/DysonNetwork.Sphere/Connection/WebReader/WebReaderController.cs
@@ -0,0 +1,119 @@
+using DysonNetwork.Sphere.Permission;
+using Microsoft.AspNetCore.Authorization;
+using Microsoft.AspNetCore.Mvc;
+using Microsoft.AspNetCore.RateLimiting;
+
+namespace DysonNetwork.Sphere.Connection.WebReader;
+
+/// <summary>
+/// Controller for web scraping and link preview services
+/// </summary>
+[ApiController]
+[Route("/scrap")]
+[EnableRateLimiting("fixed")]
+public class WebReaderController(WebReaderService reader, ILogger<WebReaderController> logger)
+    : ControllerBase
+{
+    /// <summary>
+    /// Retrieves a preview for the provided URL
+    /// </summary>
+    /// <param name="url">URL-encoded link to generate preview for</param>
+    /// <returns>Link preview data including title, description, and image</returns>
+    [HttpGet("link")]
+    public async Task<ActionResult<LinkEmbed>> ScrapLink([FromQuery] string url)
+    {
+        if (string.IsNullOrEmpty(url))
+        {
+            return BadRequest(new { error = "URL parameter is required" });
+        }
+
+        try
+        {
+            // Ensure URL is properly decoded
+            var decodedUrl = UrlDecoder.Decode(url);
+
+            // Validate URL format. Only http(s) is accepted: on Unix a rooted path such as
+            // "/etc/passwd" also parses as an absolute (file://) URI, and only web URLs
+            // are meant to be fetched by the preview service.
+            if (!Uri.TryCreate(decodedUrl, UriKind.Absolute, out var parsedUri) ||
+                (parsedUri.Scheme != Uri.UriSchemeHttp && parsedUri.Scheme != Uri.UriSchemeHttps))
+            {
+                return BadRequest(new { error = "Invalid URL format" });
+            }
+
+            var linkEmbed = await reader.GetLinkPreviewAsync(decodedUrl);
+            return Ok(linkEmbed);
+        }
+        catch (WebReaderException ex)
+        {
+            logger.LogWarning(ex, "Error scraping link: {Url}", url);
+            return BadRequest(new { error = ex.Message });
+        }
+        catch (Exception ex)
+        {
+            logger.LogError(ex, "Unexpected error scraping link: {Url}", url);
+            return StatusCode(StatusCodes.Status500InternalServerError,
+                new { error = "An unexpected error occurred while processing the link" });
+        }
+    }
+
+    /// <summary>
+    /// Force invalidates the cache for a specific URL
+    /// </summary>
+    /// <param name="url">The link whose cached preview should be dropped</param>
+    [HttpDelete("link/cache")]
+    [Authorize]
+    [RequiredPermission("maintenance", "cache.scrap")]
+    public async Task<IActionResult> InvalidateCache([FromQuery] string url)
+    {
+        if (string.IsNullOrEmpty(url))
+        {
+            return BadRequest(new { error = "URL parameter is required" });
+        }
+
+        // Decode exactly like ScrapLink does, otherwise the cache key computed here
+        // differs from the one the preview was stored under.
+        await reader.InvalidateCacheForUrlAsync(UrlDecoder.Decode(url));
+        return Ok(new { message = "Cache invalidated for URL" });
+    }
+
+    /// <summary>
+    /// Force invalidates all cached link previews
+    /// </summary>
+    [HttpDelete("cache/all")]
+    [Authorize]
+    [RequiredPermission("maintenance", "cache.scrap")]
+    public async Task<IActionResult> InvalidateAllCache()
+    {
+        await reader.InvalidateAllCachedPreviewsAsync();
+        return Ok(new { message = "All link preview caches invalidated" });
+    }
+}
+
+/// <summary>
+/// Helper class for URL decoding
+/// </summary>
+public static class UrlDecoder
+{
+    /// <summary>
+    /// Percent-decodes <paramref name="url"/>; input without '%' or '+' is returned untouched.
+    /// </summary>
+    public static string Decode(string url)
+    {
+        // First check if URL is already decoded
+        if (!url.Contains('%') && !url.Contains('+'))
+        {
+            return url;
+        }
+
+        try
+        {
+            return System.Net.WebUtility.UrlDecode(url);
+        }
+        catch
+        {
+            // If decoding fails, return the original string
+            return url;
+        }
+    }
+}
\ No newline at end of file
diff --git a/DysonNetwork.Sphere/Connection/WebReader/WebReaderException.cs b/DysonNetwork.Sphere/Connection/WebReader/WebReaderException.cs
new file mode 100644
index 0000000..31b8032
--- /dev/null
+++ b/DysonNetwork.Sphere/Connection/WebReader/WebReaderException.cs
@@ -0,0 +1,17 @@
+using System;
+
+namespace DysonNetwork.Sphere.Connection.WebReader;
+
+/// <summary>
+/// Exception thrown when an error occurs during web reading operations
+/// </summary>
+public class WebReaderException : Exception
+{
+    public WebReaderException(string message) : base(message)
+    {
+    }
+
+    public WebReaderException(string message, Exception innerException) : base(message, innerException)
+    {
+    }
+}
diff --git a/DysonNetwork.Sphere/Connection/WebReader/WebReaderService.cs b/DysonNetwork.Sphere/Connection/WebReader/WebReaderService.cs
new file mode 100644
index 0000000..b86b352
--- /dev/null
+++ b/DysonNetwork.Sphere/Connection/WebReader/WebReaderService.cs
@@ -0,0 +1,374 @@
+using System.Globalization;
+using AngleSharp;
+using AngleSharp.Dom;
+using DysonNetwork.Sphere.Storage;
+
+namespace DysonNetwork.Sphere.Connection.WebReader;
+
+/// <summary>
+/// The service is aimed at providing scraping services to the Solar Network,
+/// such as news feeds, external articles and link previews.
+/// </summary>
+public class WebReaderService(
+    IHttpClientFactory httpClientFactory,
+    ILogger<WebReaderService> logger,
+    ICacheService cache)
+{
+    private const string LinkPreviewCachePrefix = "scrap:preview:";
+    private const string LinkPreviewCacheGroup = "scrap:preview";
+
+    /// <summary>
+    /// Generate a link preview embed from a URL
+    /// </summary>
+    /// <param name="url">The URL to generate the preview for</param>
+    /// <param name="cancellationToken">Cancellation token</param>
+    /// <param name="cacheExpiry">Custom cache expiration time</param>
+    /// <param name="bypassCache">If true, bypass cache and fetch fresh data</param>
+    /// <returns>A LinkEmbed object containing the preview data</returns>
+    /// <exception cref="ArgumentException">The URL is not an absolute http(s) URL</exception>
+    /// <exception cref="WebReaderException">The page could not be fetched or timed out</exception>
+    public async Task<LinkEmbed> GetLinkPreviewAsync(
+        string url,
+        CancellationToken cancellationToken = default,
+        TimeSpan? cacheExpiry = null,
+        bool bypassCache = false
+    )
+    {
+        // Ensure URL is valid. The scheme check matters: on Unix a rooted path parses as
+        // an absolute file:// URI, which must never reach the HTTP client.
+        if (!Uri.TryCreate(url, UriKind.Absolute, out var uri) || !IsWebUri(uri))
+        {
+            throw new ArgumentException(@"Invalid URL format", nameof(url));
+        }
+
+        // Try to get from cache if not bypassing
+        if (!bypassCache)
+        {
+            var cachedPreview = await GetCachedLinkPreview(url);
+            if (cachedPreview is not null)
+                return cachedPreview;
+        }
+
+        // NOTE(review): the target is caller-supplied and fetched as-is, so internal
+        // hosts are reachable through this service (SSRF). Consider resolving the host
+        // and rejecting loopback/private ranges before issuing the request.
+
+        // Cache miss or bypass, fetch fresh data
+        logger.LogDebug("Fetching fresh link preview for URL: {Url}", url);
+        var httpClient = httpClientFactory.CreateClient("WebReader");
+        httpClient.MaxResponseContentBufferSize = 10 * 1024 * 1024; // 10MB, prevent scrap some directly accessible files
+        httpClient.Timeout = TimeSpan.FromSeconds(3);
+        httpClient.DefaultRequestHeaders.Add("User-Agent", "DysonNetwork/1.0 LinkPreview Bot");
+
+        try
+        {
+            // Dispose the response so the connection goes back to the pool promptly
+            using var response = await httpClient.GetAsync(url, cancellationToken);
+            response.EnsureSuccessStatusCode();
+
+            var contentType = response.Content.Headers.ContentType?.MediaType;
+            if (contentType == null || !contentType.StartsWith("text/html", StringComparison.OrdinalIgnoreCase))
+            {
+                logger.LogWarning("URL is not an HTML page: {Url}, ContentType: {ContentType}", url, contentType);
+                var nonHtmlEmbed = new LinkEmbed
+                {
+                    Url = url,
+                    Title = uri.Host,
+                    ContentType = contentType
+                };
+
+                // Cache non-HTML responses too
+                await CacheLinkPreview(nonHtmlEmbed, url, cacheExpiry);
+                return nonHtmlEmbed;
+            }
+
+            var html = await response.Content.ReadAsStringAsync(cancellationToken);
+            var linkEmbed = await ExtractLinkData(url, html, uri);
+
+            // Cache the result
+            await CacheLinkPreview(linkEmbed, url, cacheExpiry);
+
+            return linkEmbed;
+        }
+        catch (HttpRequestException ex)
+        {
+            logger.LogError(ex, "Failed to fetch URL: {Url}", url);
+            throw new WebReaderException($"Failed to fetch URL: {url}", ex);
+        }
+        catch (TaskCanceledException ex) when (!cancellationToken.IsCancellationRequested)
+        {
+            // HttpClient reports its own Timeout as a TaskCanceledException, not as an
+            // HttpRequestException; surface it as a fetch failure instead of a server error.
+            logger.LogWarning(ex, "Timed out fetching URL: {Url}", url);
+            throw new WebReaderException($"Timed out fetching URL: {url}", ex);
+        }
+    }
+
+    /// <summary>
+    /// Parse the HTML and collect preview metadata, prioritizing OpenGraph over
+    /// Twitter card tags over plain meta tags over the page title.
+    /// </summary>
+    private async Task<LinkEmbed> ExtractLinkData(string url, string html, Uri uri)
+    {
+        var embed = new LinkEmbed
+        {
+            Url = url
+        };
+
+        // Configure AngleSharp context
+        var config = Configuration.Default;
+        var context = BrowsingContext.New(config);
+        var document = await context.OpenAsync(req => req.Content(html));
+
+        // Extract OpenGraph tags
+        var ogTitle = GetMetaTagContent(document, "og:title");
+        var ogDescription = GetMetaTagContent(document, "og:description");
+        var ogImage = GetMetaTagContent(document, "og:image");
+        var ogSiteName = GetMetaTagContent(document, "og:site_name");
+        var ogType = GetMetaTagContent(document, "og:type");
+
+        // Extract Twitter card tags as fallback
+        var twitterTitle = GetMetaTagContent(document, "twitter:title");
+        var twitterDescription = GetMetaTagContent(document, "twitter:description");
+        var twitterImage = GetMetaTagContent(document, "twitter:image");
+
+        // Extract standard meta tags as final fallback
+        // (GetMetaTagContent already covers both property= and name= attributes)
+        var metaTitle = GetMetaTagContent(document, "title");
+        var metaDescription = GetMetaTagContent(document, "description");
+
+        // Extract page title; an empty one must fall through to the host name below
+        var pageTitle = document.Title?.Trim();
+        if (string.IsNullOrEmpty(pageTitle))
+            pageTitle = null;
+
+        // Extract publish date
+        var publishedTime = GetMetaTagContent(document, "article:published_time") ??
+                            GetMetaTagContent(document, "datePublished") ??
+                            GetMetaTagContent(document, "pubdate");
+
+        // Extract author
+        var author = GetMetaTagContent(document, "author") ??
+                     GetMetaTagContent(document, "article:author");
+
+        // Extract favicon
+        var faviconUrl = GetFaviconUrl(document, uri);
+
+        // Populate the embed with the data, prioritizing OpenGraph
+        embed.Title = ogTitle ?? twitterTitle ?? metaTitle ?? pageTitle ?? uri.Host;
+        embed.Description = ogDescription ?? twitterDescription ?? metaDescription;
+        embed.ImageUrl = ResolveRelativeUrl(ogImage ?? twitterImage, uri);
+        embed.SiteName = ogSiteName ?? uri.Host;
+        embed.ContentType = ogType;
+        embed.FaviconUrl = faviconUrl;
+        embed.Author = author;
+
+        // Parse and set published date. Timestamps without an offset are taken as UTC
+        // so the stored value always has DateTimeKind.Utc.
+        if (!string.IsNullOrEmpty(publishedTime) &&
+            DateTime.TryParse(publishedTime, CultureInfo.InvariantCulture,
+                DateTimeStyles.AssumeUniversal | DateTimeStyles.AdjustToUniversal,
+                out DateTime parsedDate))
+        {
+            embed.PublishedDate = parsedDate;
+        }
+
+        return embed;
+    }
+
+    /// <summary>
+    /// Read the content of a meta tag identified by either its property= or name=
+    /// attribute. Blank content counts as absent so fallbacks can kick in.
+    /// </summary>
+    private static string? GetMetaTagContent(IDocument doc, string property)
+    {
+        // Check for OpenGraph/Twitter style meta tags
+        var node = doc.QuerySelector($"meta[property='{property}'][content]")
+                   ?? doc.QuerySelector($"meta[name='{property}'][content]");
+
+        var content = node?.GetAttribute("content")?.Trim();
+        return string.IsNullOrEmpty(content) ? null : content;
+    }
+
+    /// <summary>
+    /// Pick the best icon declared by the page, falling back to /favicon.ico
+    /// </summary>
+    private static string? GetFaviconUrl(IDocument doc, Uri baseUri)
+    {
+        // Look for apple-touch-icon first as it's typically higher quality
+        var appleIconNode = doc.QuerySelector("link[rel='apple-touch-icon'][href]");
+        if (appleIconNode != null)
+        {
+            return ResolveRelativeUrl(appleIconNode.GetAttribute("href"), baseUri);
+        }
+
+        // Then check for standard favicon
+        var faviconNode = doc.QuerySelector("link[rel='icon'][href]") ??
+                          doc.QuerySelector("link[rel='shortcut icon'][href]");
+
+        return faviconNode != null
+            ? ResolveRelativeUrl(faviconNode.GetAttribute("href"), baseUri)
+            : new Uri(baseUri, "/favicon.ico").ToString();
+    }
+
+    /// <summary>
+    /// True when the URI uses the http or https scheme
+    /// </summary>
+    private static bool IsWebUri(Uri uri)
+    {
+        return uri.Scheme == Uri.UriSchemeHttp || uri.Scheme == Uri.UriSchemeHttps;
+    }
+
+    /// <summary>
+    /// Turn a possibly relative URL found in the page into an absolute one
+    /// </summary>
+    private static string? ResolveRelativeUrl(string? url, Uri baseUri)
+    {
+        if (string.IsNullOrEmpty(url))
+        {
+            return null;
+        }
+
+        // A file URI does not count as "already absolute": on Unix a root-relative
+        // path such as "/img/a.png" parses as an absolute file:// URI and would
+        // otherwise be returned unresolved.
+        if (Uri.TryCreate(url, UriKind.Absolute, out var parsed) && !parsed.IsFile)
+        {
+            return url; // Already absolute
+        }
+
+        return Uri.TryCreate(baseUri, url, out var absoluteUri) ? absoluteUri.ToString() : null;
+    }
+
+    /// <summary>
+    /// Generate a hash-based cache key for a URL
+    /// </summary>
+    private string GenerateUrlCacheKey(string url)
+    {
+        // Normalize the URL first
+        var normalizedUrl = NormalizeUrl(url);
+
+        // Create SHA256 hash of the normalized URL
+        var urlBytes = System.Text.Encoding.UTF8.GetBytes(normalizedUrl);
+        var hashBytes = System.Security.Cryptography.SHA256.HashData(urlBytes);
+
+        // Convert to hex string
+        var hashString = Convert.ToHexString(hashBytes).ToLowerInvariant();
+
+        // Return prefixed key
+        return $"{LinkPreviewCachePrefix}{hashString}";
+    }
+
+    /// <summary>
+    /// Normalize URL by trimming trailing slashes but preserving query parameters.
+    /// Only scheme and host are case-folded; path and query are case-sensitive and
+    /// must stay distinct so different pages never share a cache entry.
+    /// </summary>
+    private string NormalizeUrl(string url)
+    {
+        if (string.IsNullOrEmpty(url))
+            return string.Empty;
+
+        // First ensure we have a valid URI
+        if (!Uri.TryCreate(url, UriKind.Absolute, out var uri))
+            return url.TrimEnd('/');
+
+        // Rebuild the URL without trailing slashes but with query parameters
+        var scheme = uri.Scheme.ToLowerInvariant();
+        var host = uri.Host.ToLowerInvariant();
+        var port = uri.IsDefaultPort ? string.Empty : $":{uri.Port}";
+        var path = uri.AbsolutePath.TrimEnd('/');
+        var query = uri.Query;
+
+        return $"{scheme}://{host}{port}{path}{query}";
+    }
+
+    /// <summary>
+    /// Cache a link preview
+    /// </summary>
+    private async Task CacheLinkPreview(LinkEmbed? linkEmbed, string url, TimeSpan? expiry = null)
+    {
+        if (linkEmbed == null || string.IsNullOrEmpty(url))
+            return;
+
+        try
+        {
+            var cacheKey = GenerateUrlCacheKey(url);
+            var expiryTime = expiry ?? TimeSpan.FromHours(24);
+
+            await cache.SetWithGroupsAsync(
+                cacheKey,
+                linkEmbed,
+                [LinkPreviewCacheGroup],
+                expiryTime);
+
+            logger.LogDebug("Cached link preview for URL: {Url} with key: {CacheKey}", url, cacheKey);
+        }
+        catch (Exception ex)
+        {
+            // Log but don't throw - caching failures shouldn't break the main functionality
+            logger.LogWarning(ex, "Failed to cache link preview for URL: {Url}", url);
+        }
+    }
+
+    /// <summary>
+    /// Try to get a cached link preview
+    /// </summary>
+    private async Task<LinkEmbed?> GetCachedLinkPreview(string url)
+    {
+        if (string.IsNullOrEmpty(url))
+            return null;
+
+        try
+        {
+            var cacheKey = GenerateUrlCacheKey(url);
+            var cachedPreview = await cache.GetAsync<LinkEmbed>(cacheKey);
+
+            if (cachedPreview is not null)
+                logger.LogDebug("Retrieved cached link preview for URL: {Url}", url);
+
+            return cachedPreview;
+        }
+        catch (Exception ex)
+        {
+            logger.LogWarning(ex, "Failed to retrieve cached link preview for URL: {Url}", url);
+            return null;
+        }
+    }
+
+    /// <summary>
+    /// Invalidate cache for a specific URL
+    /// </summary>
+    public async Task InvalidateCacheForUrlAsync(string url)
+    {
+        if (string.IsNullOrEmpty(url))
+            return;
+
+        try
+        {
+            var cacheKey = GenerateUrlCacheKey(url);
+            await cache.RemoveAsync(cacheKey);
+            logger.LogDebug("Invalidated cache for URL: {Url} with key: {CacheKey}", url, cacheKey);
+        }
+        catch (Exception ex)
+        {
+            logger.LogWarning(ex, "Failed to invalidate cache for URL: {Url}", url);
+        }
+    }
+
+    /// <summary>
+    /// Invalidate all cached link previews
+    /// </summary>
+    public async Task InvalidateAllCachedPreviewsAsync()
+    {
+        try
+        {
+            await cache.RemoveGroupAsync(LinkPreviewCacheGroup);
+            logger.LogInformation("Invalidated all cached link previews");
+        }
+        catch (Exception ex)
+        {
+            logger.LogWarning(ex, "Failed to invalidate all cached link previews");
+        }
+    }
+}
\ No newline at end of file
diff --git a/DysonNetwork.Sphere/DysonNetwork.Sphere.csproj b/DysonNetwork.Sphere/DysonNetwork.Sphere.csproj
index 96cd85d..655084a 100644
--- a/DysonNetwork.Sphere/DysonNetwork.Sphere.csproj
+++ b/DysonNetwork.Sphere/DysonNetwork.Sphere.csproj
@@ -16,6 +16,7 @@
+
diff --git a/DysonNetwork.Sphere/Startup/ServiceCollectionExtensions.cs b/DysonNetwork.Sphere/Startup/ServiceCollectionExtensions.cs
index 4ff96b5..7041f15 100644
--- a/DysonNetwork.Sphere/Startup/ServiceCollectionExtensions.cs
+++ b/DysonNetwork.Sphere/Startup/ServiceCollectionExtensions.cs
@@ -18,14 +18,13 @@ using DysonNetwork.Sphere.Storage;
 using DysonNetwork.Sphere.Storage.Handlers;
 using DysonNetwork.Sphere.Wallet;
 using Microsoft.AspNetCore.RateLimiting;
-using Microsoft.EntityFrameworkCore;
 using Microsoft.OpenApi.Models;
 using NodaTime;
 using NodaTime.Serialization.SystemTextJson;
-using Quartz;
 using StackExchange.Redis;
 using System.Text.Json;
 using System.Threading.RateLimiting;
+using DysonNetwork.Sphere.Connection.WebReader;
 using tusdotnet.Stores;
 
 namespace DysonNetwork.Sphere.Startup;
@@ -221,6 +220,7 @@ public static class ServiceCollectionExtensions
         services.AddScoped();
         services.AddScoped();
         services.AddScoped();
+        services.AddScoped<WebReaderService>();
 
         return services;
     }
diff --git a/DysonNetwork.sln.DotSettings.user b/DysonNetwork.sln.DotSettings.user
index 6ea6d0c..1a33727 100644
--- a/DysonNetwork.sln.DotSettings.user
+++ b/DysonNetwork.sln.DotSettings.user
@@ -74,6 +74,7 @@
 ForceIncluded
 ForceIncluded
 ForceIncluded
+ForceIncluded
 ForceIncluded
 ForceIncluded
 ForceIncluded