diff --git a/DysonNetwork.Sphere/Connection/WebReader/IEmbeddable.cs b/DysonNetwork.Sphere/Connection/WebReader/IEmbeddable.cs
new file mode 100644
index 0000000..958f05c
--- /dev/null
+++ b/DysonNetwork.Sphere/Connection/WebReader/IEmbeddable.cs
@@ -0,0 +1,26 @@
+namespace DysonNetwork.Sphere.Connection.WebReader;
+
+/// <summary>
+/// Contract for rich content entries that can be placed in the <c>embeds</c>
+/// array of a post's or message's <c>meta</c> object.
+/// </summary>
+/// <example>
+/// A link preview embed inside a post payload:
+/// <code>
+/// {
+///   // ... post content
+///   "meta": {
+///     "embeds": [
+///       {
+///         "type": "link",
+///         "title": "...",
+///         // ...
+///       }
+///     ]
+///   }
+/// }
+/// </code>
+/// </example>
+public interface IEmbeddable
+{
+    /// <summary>
+    /// Name of the embed kind; surfaced as the <c>type</c> field in the example above.
+    /// </summary>
+    string Type { get; }
+}
\ No newline at end of file
diff --git a/DysonNetwork.Sphere/Connection/WebReader/LinkEmbed.cs b/DysonNetwork.Sphere/Connection/WebReader/LinkEmbed.cs
new file mode 100644
index 0000000..72e4849
--- /dev/null
+++ b/DysonNetwork.Sphere/Connection/WebReader/LinkEmbed.cs
@@ -0,0 +1,55 @@
+namespace DysonNetwork.Sphere.Connection.WebReader;
+
+/// <summary>
+/// The link embed is one of the <see cref="IEmbeddable"/> implementations.
+/// It can be used in the post or messages' meta's embeds fields.
+/// </summary>
+public class LinkEmbed : IEmbeddable
+{
+ /// <summary>
+ /// Embed kind; always the constant <c>"link"</c> for this class.
+ /// </summary>
+ public string Type => "link";
+
+ /// <summary>
+ /// The original URL that was processed
+ /// </summary>
+ public required string Url { get; set; }
+
+ /// <summary>
+ /// Title of the linked content (from OpenGraph og:title, meta title, or page title)
+ /// </summary>
+ public string? Title { get; set; }
+
+ /// <summary>
+ /// Description of the linked content (from OpenGraph og:description or meta description)
+ /// </summary>
+ public string? Description { get; set; }
+
+ /// <summary>
+ /// URL to the thumbnail image (from OpenGraph og:image or other meta tags)
+ /// </summary>
+ public string? ImageUrl { get; set; }
+
+ /// <summary>
+ /// The favicon URL of the site
+ /// </summary>
+ public string? FaviconUrl { get; set; }
+
+ /// <summary>
+ /// The site name (from OpenGraph og:site_name)
+ /// </summary>
+ public string? SiteName { get; set; }
+
+ /// <summary>
+ /// Type of the content (from OpenGraph og:type)
+ /// </summary>
+ public string? ContentType { get; set; }
+
+ /// <summary>
+ /// Author of the content if available
+ /// </summary>
+ public string? Author { get; set; }
+
+ /// <summary>
+ /// Published date of the content if available
+ /// </summary>
+ public DateTime? PublishedDate { get; set; }
+}
\ No newline at end of file
diff --git a/DysonNetwork.Sphere/Connection/WebReader/WebReaderController.cs b/DysonNetwork.Sphere/Connection/WebReader/WebReaderController.cs
new file mode 100644
index 0000000..c32bbb4
--- /dev/null
+++ b/DysonNetwork.Sphere/Connection/WebReader/WebReaderController.cs
@@ -0,0 +1,110 @@
+using DysonNetwork.Sphere.Permission;
+using Microsoft.AspNetCore.Authorization;
+using Microsoft.AspNetCore.Mvc;
+using Microsoft.AspNetCore.RateLimiting;
+
+namespace DysonNetwork.Sphere.Connection.WebReader;
+
+/// <summary>
+/// Controller for web scraping and link preview services
+/// </summary>
+[ApiController]
+[Route("/scrap")]
+[EnableRateLimiting("fixed")]
+public class WebReaderController(WebReaderService reader, ILogger<WebReaderController> logger)
+    : ControllerBase
+{
+    /// <summary>
+    /// Retrieves a preview for the provided URL
+    /// </summary>
+    /// <param name="url">URL-encoded link to generate preview for; must resolve to an absolute http or https URL</param>
+    /// <returns>
+    /// 200 with the link preview data; 400 when the URL is missing, malformed, not http(s),
+    /// or the reader reports a <see cref="WebReaderException"/>; 500 on any other failure
+    /// </returns>
+    [HttpGet("link")]
+    public async Task<ActionResult<LinkEmbed>> ScrapLink([FromQuery] string url)
+    {
+        if (string.IsNullOrEmpty(url))
+        {
+            return BadRequest(new { error = "URL parameter is required" });
+        }
+
+        try
+        {
+            if (!TryResolveTargetUrl(url, out var targetUrl))
+            {
+                return BadRequest(new { error = "Invalid URL format" });
+            }
+
+            var linkEmbed = await reader.GetLinkPreviewAsync(targetUrl);
+            return Ok(linkEmbed);
+        }
+        catch (WebReaderException ex)
+        {
+            logger.LogWarning(ex, "Error scraping link: {Url}", url);
+            return BadRequest(new { error = ex.Message });
+        }
+        catch (Exception ex)
+        {
+            logger.LogError(ex, "Unexpected error scraping link: {Url}", url);
+            return StatusCode(StatusCodes.Status500InternalServerError,
+                new { error = "An unexpected error occurred while processing the link" });
+        }
+    }
+
+    /// <summary>
+    /// Force invalidates the cache for a specific URL
+    /// </summary>
+    /// <param name="url">The link whose cached preview should be dropped</param>
+    [HttpDelete("link/cache")]
+    [Authorize]
+    [RequiredPermission("maintenance", "cache.scrap")]
+    public async Task<IActionResult> InvalidateCache([FromQuery] string url)
+    {
+        if (string.IsNullOrEmpty(url))
+        {
+            return BadRequest(new { error = "URL parameter is required" });
+        }
+
+        // Use the same resolution as ScrapLink so the key removed here is the key
+        // that was written there; fall back to the raw value for anything else.
+        var targetUrl = TryResolveTargetUrl(url, out var resolved) ? resolved : url;
+
+        await reader.InvalidateCacheForUrlAsync(targetUrl);
+        return Ok(new { message = "Cache invalidated for URL" });
+    }
+
+    /// <summary>
+    /// Force invalidates all cached link previews
+    /// </summary>
+    [HttpDelete("cache/all")]
+    [Authorize]
+    [RequiredPermission("maintenance", "cache.scrap")]
+    public async Task<IActionResult> InvalidateAllCache()
+    {
+        await reader.InvalidateAllCachedPreviewsAsync();
+        return Ok(new { message = "All link preview caches invalidated" });
+    }
+
+    /// <summary>
+    /// Turns the bound query value into the URL that should actually be fetched.
+    /// </summary>
+    /// <remarks>
+    /// Query-string binding has already percent-decoded the value once, so a value that is
+    /// already a valid http(s) URL is used untouched: decoding it again would rewrite
+    /// escapes that belong to the target URL itself (for example <c>%26</c> or <c>%2B</c>
+    /// inside its query). Only values that are not yet a valid URL, as sent by clients
+    /// that encode twice, get one extra decoding pass.
+    /// </remarks>
+    private static bool TryResolveTargetUrl(string url, out string targetUrl)
+    {
+        if (IsHttpUrl(url))
+        {
+            targetUrl = url;
+            return true;
+        }
+
+        targetUrl = UrlDecoder.Decode(url);
+        return IsHttpUrl(targetUrl);
+    }
+
+    /// <summary>
+    /// True when the candidate parses as an absolute URI whose scheme is http or https.
+    /// The scheme check rejects file:, ftp: and similar inputs, as well as bare paths
+    /// that some platforms parse as absolute file URIs.
+    /// </summary>
+    private static bool IsHttpUrl(string candidate) =>
+        Uri.TryCreate(candidate, UriKind.Absolute, out var parsed) &&
+        (parsed.Scheme == Uri.UriSchemeHttp || parsed.Scheme == Uri.UriSchemeHttps);
+}
+
+/// <summary>
+/// Helper class for URL decoding
+/// </summary>
+public static class UrlDecoder
+{
+    /// <summary>
+    /// Reverses percent-encoding (<c>%XX</c>) in a URL.
+    /// </summary>
+    /// <param name="url">The possibly percent-encoded URL</param>
+    /// <returns>
+    /// The decoded URL, or the input unchanged when it is null, empty, or contains no
+    /// <c>%</c> character
+    /// </returns>
+    public static string Decode(string url)
+    {
+        // Nothing to decode; this also keeps null/empty input from throwing.
+        if (string.IsNullOrEmpty(url) || !url.Contains('%'))
+        {
+            return url;
+        }
+
+        // Only %XX escapes are reversed. A '+' is left alone on purpose: form-style
+        // decoding would turn it into a space and corrupt URLs that contain a literal
+        // plus sign (for example ".../c++/intro").
+        return Uri.UnescapeDataString(url);
+    }
+}
\ No newline at end of file
diff --git a/DysonNetwork.Sphere/Connection/WebReader/WebReaderException.cs b/DysonNetwork.Sphere/Connection/WebReader/WebReaderException.cs
new file mode 100644
index 0000000..31b8032
--- /dev/null
+++ b/DysonNetwork.Sphere/Connection/WebReader/WebReaderException.cs
@@ -0,0 +1,17 @@
+using System;
+
+namespace DysonNetwork.Sphere.Connection.WebReader;
+
+/// <summary>
+/// Exception thrown when an error occurs during web reading operations
+/// </summary>
+public class WebReaderException : Exception
+{
+    /// <summary>Creates the exception with a descriptive message.</summary>
+    /// <param name="message">Description of what went wrong</param>
+    public WebReaderException(string message)
+        : base(message) { }
+
+    /// <summary>Creates the exception with a message and the underlying cause.</summary>
+    /// <param name="message">Description of what went wrong</param>
+    /// <param name="innerException">The exception that triggered this one</param>
+    public WebReaderException(string message, Exception innerException)
+        : base(message, innerException) { }
+}
diff --git a/DysonNetwork.Sphere/Connection/WebReader/WebReaderService.cs b/DysonNetwork.Sphere/Connection/WebReader/WebReaderService.cs
new file mode 100644
index 0000000..b86b352
--- /dev/null
+++ b/DysonNetwork.Sphere/Connection/WebReader/WebReaderService.cs
@@ -0,0 +1,334 @@
+using System.Globalization;
+using AngleSharp;
+using AngleSharp.Dom;
+using DysonNetwork.Sphere.Storage;
+
+namespace DysonNetwork.Sphere.Connection.WebReader;
+
+/// <summary>
+/// This service aims to provide scraping services to the Solar Network,
+/// such as news feeds, external articles and link previews.
+/// </summary>
+public class WebReaderService(
+ IHttpClientFactory httpClientFactory,
+ ILogger logger,
+ ICacheService cache)
+{
+ private const string LinkPreviewCachePrefix = "scrap:preview:";
+ private const string LinkPreviewCacheGroup = "scrap:preview";
+
+ /// <summary>
+ /// Generate a link preview embed from a URL
+ /// </summary>
+ /// <param name="url">The URL to generate the preview for; must be an absolute http or https URL</param>
+ /// <param name="cancellationToken">Cancellation token</param>
+ /// <param name="cacheExpiry">Custom cache expiration time</param>
+ /// <param name="bypassCache">If true, bypass cache and fetch fresh data</param>
+ /// <returns>A LinkEmbed object containing the preview data</returns>
+ /// <exception cref="ArgumentException">The URL is not an absolute http(s) URL</exception>
+ /// <exception cref="WebReaderException">The request failed or timed out</exception>
+ public async Task<LinkEmbed> GetLinkPreviewAsync(
+     string url,
+     CancellationToken cancellationToken = default,
+     TimeSpan? cacheExpiry = null,
+     bool bypassCache = false
+ )
+ {
+     // Only web URLs can be fetched. The scheme check also rejects bare paths that
+     // some platforms parse as absolute file URIs.
+     if (!Uri.TryCreate(url, UriKind.Absolute, out var uri) ||
+         (uri.Scheme != Uri.UriSchemeHttp && uri.Scheme != Uri.UriSchemeHttps))
+     {
+         throw new ArgumentException(@"Invalid URL format", nameof(url));
+     }
+
+     // Try to get from cache if not bypassing
+     if (!bypassCache)
+     {
+         var cachedPreview = await GetCachedLinkPreview(url);
+         if (cachedPreview is not null)
+         {
+             return cachedPreview;
+         }
+     }
+
+     // Cache miss or bypass, fetch fresh data
+     logger.LogDebug("Fetching fresh link preview for URL: {Url}", url);
+     var httpClient = httpClientFactory.CreateClient("WebReader");
+     httpClient.MaxResponseContentBufferSize = 10 * 1024 * 1024; // 10MB, prevent scrap some directly accessible files
+     httpClient.Timeout = TimeSpan.FromSeconds(3);
+     httpClient.DefaultRequestHeaders.Add("User-Agent", "DysonNetwork/1.0 LinkPreview Bot");
+
+     try
+     {
+         // Dispose the response so its content buffer is released promptly.
+         using var response = await httpClient.GetAsync(uri, cancellationToken);
+         response.EnsureSuccessStatusCode();
+
+         // Media types are case-insensitive, so compare ordinally ignoring case.
+         var contentType = response.Content.Headers.ContentType?.MediaType;
+         if (contentType is null || !contentType.StartsWith("text/html", StringComparison.OrdinalIgnoreCase))
+         {
+             logger.LogWarning("URL is not an HTML page: {Url}, ContentType: {ContentType}", url, contentType);
+             var nonHtmlEmbed = new LinkEmbed
+             {
+                 Url = url,
+                 Title = uri.Host,
+                 ContentType = contentType
+             };
+
+             // Cache non-HTML responses too
+             await CacheLinkPreview(nonHtmlEmbed, url, cacheExpiry);
+             return nonHtmlEmbed;
+         }
+
+         var html = await response.Content.ReadAsStringAsync(cancellationToken);
+         var linkEmbed = await ExtractLinkData(url, html, uri);
+
+         // Cache the result
+         await CacheLinkPreview(linkEmbed, url, cacheExpiry);
+
+         return linkEmbed;
+     }
+     catch (HttpRequestException ex)
+     {
+         logger.LogError(ex, "Failed to fetch URL: {Url}", url);
+         throw new WebReaderException($"Failed to fetch URL: {url}", ex);
+     }
+     catch (OperationCanceledException ex) when (!cancellationToken.IsCancellationRequested)
+     {
+         // The caller did not cancel, so this cancellation is the client's own
+         // timeout expiring. Report it like any other fetch failure.
+         logger.LogWarning(ex, "Timed out fetching URL: {Url}", url);
+         throw new WebReaderException($"Timed out fetching URL: {url}", ex);
+     }
+ }
+
+ /// <summary>
+ /// Parses an HTML page and builds the link preview from its metadata.
+ /// </summary>
+ /// <remarks>
+ /// Sources are tried in priority order: OpenGraph tags, then Twitter card tags, then
+ /// standard meta tags and the page title, and finally the host name. Blank values are
+ /// treated as missing so that a present-but-empty tag does not block the next fallback.
+ /// </remarks>
+ /// <param name="url">The URL the page was fetched from; stored on the embed</param>
+ /// <param name="html">The page markup</param>
+ /// <param name="uri">Parsed form of <paramref name="url"/>, used for the host name and to resolve relative links</param>
+ /// <returns>The populated link embed</returns>
+ private async Task<LinkEmbed> ExtractLinkData(string url, string html, Uri uri)
+ {
+     // Configure AngleSharp context
+     var context = BrowsingContext.New(Configuration.Default);
+
+     // Only plain strings are copied out of the document, so it can be released
+     // as soon as this method returns.
+     using var document = await context.OpenAsync(req => req.Content(html));
+
+     static string? NullIfBlank(string? value) =>
+         string.IsNullOrWhiteSpace(value) ? null : value.Trim();
+
+     string? Meta(string property) => NullIfBlank(GetMetaTagContent(document, property));
+
+     // Extract OpenGraph tags
+     var ogTitle = Meta("og:title");
+     var ogDescription = Meta("og:description");
+     var ogImage = Meta("og:image");
+     var ogSiteName = Meta("og:site_name");
+     var ogType = Meta("og:type");
+
+     // Extract Twitter card tags as fallback
+     var twitterTitle = Meta("twitter:title");
+     var twitterDescription = Meta("twitter:description");
+     var twitterImage = Meta("twitter:image");
+
+     // Extract standard meta tags as final fallback
+     var metaTitle = Meta("title") ?? NullIfBlank(GetMetaContent(document, "title"));
+     var metaDescription = Meta("description");
+
+     // The page title may be an empty string when the page has no title; normalise
+     // that to null so the host-name fallback below can apply.
+     var pageTitle = NullIfBlank(document.Title);
+
+     // Extract publish date
+     var publishedTime = Meta("article:published_time") ??
+                         Meta("datePublished") ??
+                         Meta("pubdate");
+
+     // Extract author
+     var author = Meta("author") ?? Meta("article:author");
+
+     // Populate the embed with the data, prioritizing OpenGraph
+     var embed = new LinkEmbed
+     {
+         Url = url,
+         Title = ogTitle ?? twitterTitle ?? metaTitle ?? pageTitle ?? uri.Host,
+         Description = ogDescription ?? twitterDescription ?? metaDescription,
+         ImageUrl = ResolveRelativeUrl(ogImage ?? twitterImage, uri),
+         SiteName = ogSiteName ?? uri.Host,
+         ContentType = ogType,
+         FaviconUrl = GetFaviconUrl(document, uri),
+         Author = author
+     };
+
+     // Parse and set published date
+     if (publishedTime is not null &&
+         DateTime.TryParse(publishedTime, CultureInfo.InvariantCulture, DateTimeStyles.AdjustToUniversal,
+             out var parsedDate))
+     {
+         embed.PublishedDate = parsedDate;
+     }
+
+     return embed;
+ }
+
+ /// <summary>
+ /// Reads the <c>content</c> of a meta tag identified by <paramref name="property"/>.
+ /// </summary>
+ /// <remarks>
+ /// The <c>property</c> attribute (OpenGraph style) is checked first, then the
+ /// <c>name</c> attribute. A tag whose content is empty after trimming is skipped,
+ /// so it cannot hide a populated tag matched by the second selector.
+ /// </remarks>
+ /// <param name="doc">The parsed document to search</param>
+ /// <param name="property">The value to match against the <c>property</c> or <c>name</c> attribute</param>
+ /// <returns>The trimmed content, or null when no matching tag has non-empty content</returns>
+ private static string? GetMetaTagContent(IDocument doc, string property)
+ {
+     string[] selectors =
+     [
+         $"meta[property='{property}'][content]",
+         $"meta[name='{property}'][content]"
+     ];
+
+     foreach (var selector in selectors)
+     {
+         var content = doc.QuerySelector(selector)?.GetAttribute("content")?.Trim();
+         if (!string.IsNullOrEmpty(content))
+         {
+             return content;
+         }
+     }
+
+     return null;
+ }
+
+ /// <summary>
+ /// Reads the trimmed <c>content</c> of the first meta tag whose <c>name</c> attribute
+ /// equals <paramref name="name"/>; null when there is no such tag.
+ /// </summary>
+ private static string? GetMetaContent(IDocument doc, string name) =>
+     doc.QuerySelector($"meta[name='{name}'][content]")
+         ?.GetAttribute("content")
+         ?.Trim();
+
+ /// <summary>
+ /// Finds the best icon URL declared by the page.
+ /// </summary>
+ /// <remarks>
+ /// Candidates are tried in order: <c>apple-touch-icon</c> (typically higher quality),
+ /// then <c>icon</c>, then <c>shortcut icon</c>. A candidate whose href is empty or
+ /// cannot be resolved is skipped rather than ending the search. When none works,
+ /// <c>/favicon.ico</c> on the page's host is returned.
+ /// </remarks>
+ /// <param name="doc">The parsed document to search</param>
+ /// <param name="baseUri">The page address, used to resolve relative hrefs</param>
+ /// <returns>An absolute icon URL</returns>
+ private static string? GetFaviconUrl(IDocument doc, Uri baseUri)
+ {
+     string[] selectors =
+     [
+         "link[rel='apple-touch-icon'][href]",
+         "link[rel='icon'][href]",
+         "link[rel='shortcut icon'][href]"
+     ];
+
+     foreach (var selector in selectors)
+     {
+         var href = doc.QuerySelector(selector)?.GetAttribute("href");
+         var resolved = ResolveRelativeUrl(href, baseUri);
+         if (resolved is not null)
+         {
+             return resolved;
+         }
+     }
+
+     return new Uri(baseUri, "/favicon.ico").ToString();
+ }
+
+ /// <summary>
+ /// Converts a URL found in a page into an absolute URL.
+ /// </summary>
+ /// <param name="url">The URL as written in the page; may be absolute, relative, or blank</param>
+ /// <param name="baseUri">The page address that relative URLs are resolved against</param>
+ /// <returns>
+ /// The input (trimmed) when it already carries its own scheme, the resolved absolute
+ /// URL for relative input, or null when the input is blank or cannot be resolved
+ /// </returns>
+ private static string? ResolveRelativeUrl(string? url, Uri baseUri)
+ {
+     if (string.IsNullOrWhiteSpace(url))
+     {
+         return null;
+     }
+
+     var candidate = url.Trim();
+
+     // "Absolute" parsing also accepts implicit file paths: on Unix "/img/a.png" parses
+     // as a file URI, and "//host/a.png" parses as a UNC path. Those are really
+     // references relative to the page, so only a non-file result counts as absolute.
+     if (Uri.TryCreate(candidate, UriKind.Absolute, out var absolute) && !absolute.IsFile)
+     {
+         return candidate; // Already absolute
+     }
+
+     return Uri.TryCreate(candidate, UriKind.Relative, out var relative) &&
+            Uri.TryCreate(baseUri, relative, out var resolved)
+         ? resolved.ToString()
+         : null;
+ }
+
+ /// <summary>
+ /// Builds the cache key for a URL: the link-preview prefix followed by the
+ /// lowercase hex SHA-256 digest of the normalized URL (UTF-8 encoded).
+ /// </summary>
+ private string GenerateUrlCacheKey(string url)
+ {
+     var canonicalBytes = System.Text.Encoding.UTF8.GetBytes(NormalizeUrl(url));
+     var digest = System.Security.Cryptography.SHA256.HashData(canonicalBytes);
+     var hex = Convert.ToHexString(digest).ToLowerInvariant();
+
+     return LinkPreviewCachePrefix + hex;
+ }
+
+ /// <summary>
+ /// Normalize URL by trimming trailing slashes but preserving query parameters
+ /// </summary>
+ /// <remarks>
+ /// Only the scheme and host are lower-cased, because those are the case-insensitive
+ /// parts of a URL. Path and query keep their original casing so that two addresses
+ /// differing only by case there are not treated as the same URL. A default port is
+ /// omitted and any fragment is dropped.
+ /// </remarks>
+ /// <param name="url">The URL to normalize</param>
+ /// <returns>
+ /// The normalized URL; an empty string for null/empty input; the input without
+ /// trailing slashes when it is not an absolute URI
+ /// </returns>
+ private string NormalizeUrl(string url)
+ {
+     if (string.IsNullOrEmpty(url))
+     {
+         return string.Empty;
+     }
+
+     // Not parseable as an absolute URI: only strip trailing slashes
+     if (!Uri.TryCreate(url, UriKind.Absolute, out var uri))
+     {
+         return url.TrimEnd('/');
+     }
+
+     var scheme = uri.Scheme.ToLowerInvariant();
+     var host = uri.Host.ToLowerInvariant();
+     var port = uri.IsDefaultPort ? string.Empty : $":{uri.Port}";
+     var path = uri.AbsolutePath.TrimEnd('/');
+
+     return $"{scheme}://{host}{port}{path}{uri.Query}";
+ }
+
+ /// <summary>
+ /// Stores a link preview in the cache under the key derived from its URL and tags it
+ /// with the link-preview cache group. Entries live for 24 hours unless
+ /// <paramref name="expiry"/> is given. Cache failures are logged and swallowed.
+ /// </summary>
+ private async Task CacheLinkPreview(LinkEmbed? linkEmbed, string url, TimeSpan? expiry = null)
+ {
+     // Nothing to store without both a preview and the URL that keys it
+     if (linkEmbed is null || string.IsNullOrEmpty(url))
+     {
+         return;
+     }
+
+     try
+     {
+         var key = GenerateUrlCacheKey(url);
+         var lifetime = expiry is { } custom ? custom : TimeSpan.FromHours(24);
+
+         await cache.SetWithGroupsAsync(key, linkEmbed, [LinkPreviewCacheGroup], lifetime);
+
+         logger.LogDebug("Cached link preview for URL: {Url} with key: {CacheKey}", url, key);
+     }
+     catch (Exception cacheError)
+     {
+         // Best effort: a cache write failure must not fail the preview request
+         logger.LogWarning(cacheError, "Failed to cache link preview for URL: {Url}", url);
+     }
+ }
+
+ /// <summary>
+ /// Try to get a cached link preview.
+ /// Returns null when the URL is null/empty, when the cache lookup yields null,
+ /// or when the cache lookup throws (the failure is logged as a warning).
+ /// </summary>
+ private async Task GetCachedLinkPreview(string url)
+ {
+ // No URL means no cache key to look up
+ if (string.IsNullOrEmpty(url))
+ return null;
+
+ try
+ {
+ // The key is derived from the URL by GenerateUrlCacheKey
+ var cacheKey = GenerateUrlCacheKey(url);
+ var cachedPreview = await cache.GetAsync(cacheKey);
+
+ // Only log on a hit; a null result is returned to the caller as-is
+ if (cachedPreview is not null)
+ logger.LogDebug("Retrieved cached link preview for URL: {Url}", url);
+
+ return cachedPreview;
+ }
+ catch (Exception ex)
+ {
+ // A cache read failure is reported as "nothing cached" rather than rethrown
+ logger.LogWarning(ex, "Failed to retrieve cached link preview for URL: {Url}", url);
+ return null;
+ }
+ }
+
+ /// <summary>
+ /// Removes the cached preview for one URL. Does nothing for a null or empty URL;
+ /// cache failures are logged as warnings and not rethrown.
+ /// </summary>
+ /// <param name="url">The URL whose cached preview should be removed</param>
+ public async Task InvalidateCacheForUrlAsync(string url)
+ {
+     if (!string.IsNullOrEmpty(url))
+     {
+         try
+         {
+             var key = GenerateUrlCacheKey(url);
+             await cache.RemoveAsync(key);
+             logger.LogDebug("Invalidated cache for URL: {Url} with key: {CacheKey}", url, key);
+         }
+         catch (Exception failure)
+         {
+             logger.LogWarning(failure, "Failed to invalidate cache for URL: {Url}", url);
+         }
+     }
+ }
+
+ /// <summary>
+ /// Removes every entry in the link-preview cache group. A failure is logged as a
+ /// warning and not rethrown.
+ /// </summary>
+ public async Task InvalidateAllCachedPreviewsAsync()
+ {
+     try
+     {
+         // Every preview is written with this group, so removing the group clears them all
+         await cache.RemoveGroupAsync(LinkPreviewCacheGroup);
+         logger.LogInformation("Invalidated all cached link previews");
+     }
+     catch (Exception failure)
+     {
+         logger.LogWarning(failure, "Failed to invalidate all cached link previews");
+     }
+ }
+}
\ No newline at end of file
diff --git a/DysonNetwork.Sphere/DysonNetwork.Sphere.csproj b/DysonNetwork.Sphere/DysonNetwork.Sphere.csproj
index 96cd85d..655084a 100644
--- a/DysonNetwork.Sphere/DysonNetwork.Sphere.csproj
+++ b/DysonNetwork.Sphere/DysonNetwork.Sphere.csproj
@@ -16,6 +16,7 @@
+
diff --git a/DysonNetwork.Sphere/Startup/ServiceCollectionExtensions.cs b/DysonNetwork.Sphere/Startup/ServiceCollectionExtensions.cs
index 4ff96b5..7041f15 100644
--- a/DysonNetwork.Sphere/Startup/ServiceCollectionExtensions.cs
+++ b/DysonNetwork.Sphere/Startup/ServiceCollectionExtensions.cs
@@ -18,14 +18,13 @@ using DysonNetwork.Sphere.Storage;
using DysonNetwork.Sphere.Storage.Handlers;
using DysonNetwork.Sphere.Wallet;
using Microsoft.AspNetCore.RateLimiting;
-using Microsoft.EntityFrameworkCore;
using Microsoft.OpenApi.Models;
using NodaTime;
using NodaTime.Serialization.SystemTextJson;
-using Quartz;
using StackExchange.Redis;
using System.Text.Json;
using System.Threading.RateLimiting;
+using DysonNetwork.Sphere.Connection.WebReader;
using tusdotnet.Stores;
namespace DysonNetwork.Sphere.Startup;
@@ -221,6 +220,7 @@ public static class ServiceCollectionExtensions
services.AddScoped();
services.AddScoped();
services.AddScoped();
+ services.AddScoped();
return services;
}
diff --git a/DysonNetwork.sln.DotSettings.user b/DysonNetwork.sln.DotSettings.user
index 6ea6d0c..1a33727 100644
--- a/DysonNetwork.sln.DotSettings.user
+++ b/DysonNetwork.sln.DotSettings.user
@@ -74,6 +74,7 @@
ForceIncluded
ForceIncluded
ForceIncluded
+ ForceIncluded
ForceIncluded
ForceIncluded
ForceIncluded