using System.Globalization;
using System.Security.Cryptography;
using System.Text;
using AngleSharp;
using AngleSharp.Dom;
using DysonNetwork.Sphere.Storage;

namespace DysonNetwork.Sphere.Connection.WebReader;

/// <summary>
/// This service aims to provide scraping services to the Solar Network,
/// such as news feeds, external articles and link previews.
/// </summary>
public class WebReaderService(
    IHttpClientFactory httpClientFactory,
    ILogger<WebReaderService> logger,
    ICacheService cache)
{
    private const string LinkPreviewCachePrefix = "scrap:preview:";
    private const string LinkPreviewCacheGroup = "scrap:preview";

    /// <summary>
    /// Generate a link preview embed from a URL.
    /// </summary>
    /// <param name="url">The absolute HTTP or HTTPS URL to generate the preview for</param>
    /// <param name="cancellationToken">Cancellation token</param>
    /// <param name="cacheExpiry">Custom cache expiration time; 24 hours when omitted</param>
    /// <param name="bypassCache">If true, bypass cache and fetch fresh data</param>
    /// <returns>A LinkEmbed object containing the preview data</returns>
    /// <exception cref="ArgumentException">The URL is not an absolute HTTP(S) URL</exception>
    /// <exception cref="WebReaderException">The remote page could not be fetched or timed out</exception>
    public async Task<LinkEmbed> GetLinkPreviewAsync(
        string url,
        CancellationToken cancellationToken = default,
        TimeSpan? cacheExpiry = null,
        bool bypassCache = false
    )
    {
        // Ensure URL is valid
        if (!Uri.TryCreate(url, UriKind.Absolute, out var uri))
        {
            throw new ArgumentException(@"Invalid URL format", nameof(url));
        }

        // UriKind.Absolute also accepts file:, ftp: and (on Unix) bare paths such as "/etc/passwd",
        // none of which can or should be fetched by the HTTP client.
        if (uri.Scheme != Uri.UriSchemeHttp && uri.Scheme != Uri.UriSchemeHttps)
        {
            throw new ArgumentException(@"Only HTTP and HTTPS URLs are supported", nameof(url));
        }

        // Try to get from cache if not bypassing
        if (!bypassCache)
        {
            var cachedPreview = await GetCachedLinkPreview(url);
            if (cachedPreview is not null)
                return cachedPreview;
        }

        // Cache miss or bypass, fetch fresh data
        logger.LogDebug("Fetching fresh link preview for URL: {Url}", url);

        var httpClient = httpClientFactory.CreateClient("WebReader");
        // 10MB cap so that directly accessible large files are not scraped
        httpClient.MaxResponseContentBufferSize = 10 * 1024 * 1024;
        httpClient.Timeout = TimeSpan.FromSeconds(3);
        httpClient.DefaultRequestHeaders.Add("User-Agent", "DysonNetwork/1.0 LinkPreview Bot");

        try
        {
            using var response = await httpClient.GetAsync(url, cancellationToken);
            response.EnsureSuccessStatusCode();

            var contentType = response.Content.Headers.ContentType?.MediaType;
            // Media types are case-insensitive; compare ordinally instead of with the current culture.
            if (contentType is null || !contentType.StartsWith("text/html", StringComparison.OrdinalIgnoreCase))
            {
                logger.LogWarning("URL is not an HTML page: {Url}, ContentType: {ContentType}", url, contentType);

                var nonHtmlEmbed = new LinkEmbed
                {
                    Url = url,
                    Title = uri.Host,
                    ContentType = contentType
                };

                // Cache non-HTML responses too
                await CacheLinkPreview(nonHtmlEmbed, url, cacheExpiry);
                return nonHtmlEmbed;
            }

            var html = await response.Content.ReadAsStringAsync(cancellationToken);
            var linkEmbed = await ExtractLinkData(url, html, uri);

            // Cache the result
            await CacheLinkPreview(linkEmbed, url, cacheExpiry);
            return linkEmbed;
        }
        catch (HttpRequestException ex)
        {
            logger.LogError(ex, "Failed to fetch URL: {Url}", url);
            throw new WebReaderException($"Failed to fetch URL: {url}", ex);
        }
        catch (OperationCanceledException ex) when (!cancellationToken.IsCancellationRequested)
        {
            // HttpClient.Timeout surfaces as a TaskCanceledException. When the caller did not cancel,
            // report it as a fetch failure rather than as a cancellation.
            logger.LogError(ex, "Timed out fetching URL: {Url}", url);
            throw new WebReaderException($"Failed to fetch URL: {url}", ex);
        }
    }

    /// <summary>
    /// Parse an HTML page and build a link embed from its OpenGraph, Twitter card and standard meta tags.
    /// </summary>
    /// <param name="url">The URL as requested by the caller; stored on the embed unchanged</param>
    /// <param name="html">The HTML source of the page</param>
    /// <param name="uri">The parsed form of <paramref name="url"/>, used for the host name and to resolve relative links</param>
    private async Task<LinkEmbed> ExtractLinkData(string url, string html, Uri uri)
    {
        var embed = new LinkEmbed
        {
            Url = url
        };

        // Configure AngleSharp context; every value is copied out as a string before disposal
        var config = Configuration.Default;
        using var context = BrowsingContext.New(config);
        using var document = await context.OpenAsync(req => req.Content(html));

        // Extract OpenGraph tags
        var ogTitle = GetMetaTagContent(document, "og:title");
        var ogDescription = GetMetaTagContent(document, "og:description");
        var ogImage = GetMetaTagContent(document, "og:image");
        var ogSiteName = GetMetaTagContent(document, "og:site_name");
        var ogType = GetMetaTagContent(document, "og:type");

        // Extract Twitter card tags as fallback
        var twitterTitle = GetMetaTagContent(document, "twitter:title");
        var twitterDescription = GetMetaTagContent(document, "twitter:description");
        var twitterImage = GetMetaTagContent(document, "twitter:image");

        // Extract standard meta tags as final fallback
        // (GetMetaTagContent already checks both the property and the name attribute)
        var metaTitle = GetMetaTagContent(document, "title");
        var metaDescription = GetMetaTagContent(document, "description");

        // Extract page title
        var pageTitle = document.Title?.Trim();

        // Extract publish date
        var publishedTime = GetMetaTagContent(document, "article:published_time") ??
                            GetMetaTagContent(document, "datePublished") ??
                            GetMetaTagContent(document, "pubdate");

        // Extract author
        var author = GetMetaTagContent(document, "author") ??
                     GetMetaTagContent(document, "article:author");

        // Extract favicon
        var faviconUrl = GetFaviconUrl(document, uri);

        // Populate the embed with the data, prioritizing OpenGraph
        embed.Title = ogTitle ?? twitterTitle ?? metaTitle ?? pageTitle ?? uri.Host;
        embed.Description = ogDescription ?? twitterDescription ?? metaDescription;
        embed.ImageUrl = ResolveRelativeUrl(ogImage ?? twitterImage, uri);
        embed.SiteName = ogSiteName ?? uri.Host;
        embed.ContentType = ogType;
        embed.FaviconUrl = faviconUrl;
        embed.Author = author;

        // Parse and set published date. Values without an offset are taken as UTC so the
        // result always has DateTimeKind.Utc instead of Unspecified.
        if (!string.IsNullOrEmpty(publishedTime) &&
            DateTime.TryParse(
                publishedTime,
                CultureInfo.InvariantCulture,
                DateTimeStyles.AssumeUniversal | DateTimeStyles.AdjustToUniversal,
                out var parsedDate))
        {
            embed.PublishedDate = parsedDate;
        }

        return embed;
    }

    /// <summary>
    /// Read the trimmed <c>content</c> of the first meta tag whose <c>property</c> attribute,
    /// or failing that whose <c>name</c> attribute, equals the given value.
    /// </summary>
    private static string? GetMetaTagContent(IDocument doc, string property)
    {
        // OpenGraph uses property=, Twitter cards and standard meta tags use name=
        var node = doc.QuerySelector($"meta[property='{property}'][content]")
                   ?? doc.QuerySelector($"meta[name='{property}'][content]");

        return node?.GetAttribute("content")?.Trim();
    }

    /// <summary>
    /// Find the page icon, falling back to <c>/favicon.ico</c> on the page's origin.
    /// </summary>
    private static string? GetFaviconUrl(IDocument doc, Uri baseUri)
    {
        // Look for apple-touch-icon first as it's typically higher quality
        var appleIconNode = doc.QuerySelector("link[rel='apple-touch-icon'][href]");
        if (appleIconNode is not null)
        {
            return ResolveRelativeUrl(appleIconNode.GetAttribute("href"), baseUri);
        }

        // Then check for standard favicon
        var faviconNode = doc.QuerySelector("link[rel='icon'][href]") ??
                          doc.QuerySelector("link[rel='shortcut icon'][href]");

        return faviconNode is not null
            ? ResolveRelativeUrl(faviconNode.GetAttribute("href"), baseUri)
            : new Uri(baseUri, "/favicon.ico").ToString();
    }

    /// <summary>
    /// Resolve a possibly relative URL against the page URI.
    /// </summary>
    /// <returns>The absolute URL, or null when the input is empty or cannot be resolved</returns>
    private static string? ResolveRelativeUrl(string? url, Uri baseUri)
    {
        if (string.IsNullOrEmpty(url))
        {
            return null;
        }

        // On Unix, root-relative ("/img/a.png") and protocol-relative ("//cdn/x.png") references
        // parse as absolute file URIs under UriKind.Absolute; those must still be resolved
        // against the page URI, so only non-file absolute URIs are returned as-is.
        if (Uri.TryCreate(url, UriKind.Absolute, out var parsed) && !parsed.IsFile)
        {
            return url; // Already absolute
        }

        return Uri.TryCreate(baseUri, url, out var absoluteUri) ? absoluteUri.ToString() : null;
    }

    /// <summary>
    /// Generate a hash-based cache key for a URL: the cache prefix followed by the
    /// lower-case hex SHA-256 of the normalized URL.
    /// </summary>
    private static string GenerateUrlCacheKey(string url)
    {
        var normalizedUrl = NormalizeUrl(url);
        var hashBytes = SHA256.HashData(Encoding.UTF8.GetBytes(normalizedUrl));
        var hashString = Convert.ToHexString(hashBytes).ToLowerInvariant();

        return $"{LinkPreviewCachePrefix}{hashString}";
    }

    /// <summary>
    /// Normalize a URL by trimming trailing slashes from the path while preserving query parameters.
    /// Only the scheme and host are lower-cased; path and query are case-sensitive and kept verbatim
    /// so that distinct pages never share a cache key.
    /// </summary>
    private static string NormalizeUrl(string url)
    {
        if (string.IsNullOrEmpty(url))
            return string.Empty;

        // Not a valid absolute URI: fall back to a plain trailing-slash trim
        if (!Uri.TryCreate(url, UriKind.Absolute, out var uri))
            return url.TrimEnd('/');

        // Rebuild the URL without trailing slashes but with query parameters (fragment is dropped)
        var scheme = uri.Scheme.ToLowerInvariant();
        var host = uri.Host.ToLowerInvariant();
        var port = uri.IsDefaultPort ? string.Empty : $":{uri.Port}";
        var path = uri.AbsolutePath.TrimEnd('/');
        var query = uri.Query;

        return $"{scheme}://{host}{port}{path}{query}";
    }

    /// <summary>
    /// Cache a link preview. Failures are logged and swallowed.
    /// </summary>
    private async Task CacheLinkPreview(LinkEmbed? linkEmbed, string url, TimeSpan? expiry = null)
    {
        if (linkEmbed is null || string.IsNullOrEmpty(url))
            return;

        try
        {
            var cacheKey = GenerateUrlCacheKey(url);
            var expiryTime = expiry ?? TimeSpan.FromHours(24);

            await cache.SetWithGroupsAsync(
                cacheKey,
                linkEmbed,
                [LinkPreviewCacheGroup],
                expiryTime);

            logger.LogDebug("Cached link preview for URL: {Url} with key: {CacheKey}", url, cacheKey);
        }
        catch (Exception ex)
        {
            // Log but don't throw - caching failures shouldn't break the main functionality
            logger.LogWarning(ex, "Failed to cache link preview for URL: {Url}", url);
        }
    }

    /// <summary>
    /// Try to get a cached link preview. Returns null on a miss or when the cache lookup fails.
    /// </summary>
    private async Task<LinkEmbed?> GetCachedLinkPreview(string url)
    {
        if (string.IsNullOrEmpty(url))
            return null;

        try
        {
            var cacheKey = GenerateUrlCacheKey(url);
            var cachedPreview = await cache.GetAsync<LinkEmbed>(cacheKey);

            if (cachedPreview is not null)
                logger.LogDebug("Retrieved cached link preview for URL: {Url}", url);

            return cachedPreview;
        }
        catch (Exception ex)
        {
            // A broken cache must not prevent a fresh fetch
            logger.LogWarning(ex, "Failed to retrieve cached link preview for URL: {Url}", url);
            return null;
        }
    }

    /// <summary>
    /// Invalidate cache for a specific URL. Failures are logged and swallowed.
    /// </summary>
    /// <param name="url">The URL whose cached preview should be removed</param>
    public async Task InvalidateCacheForUrlAsync(string url)
    {
        if (string.IsNullOrEmpty(url))
            return;

        try
        {
            var cacheKey = GenerateUrlCacheKey(url);
            await cache.RemoveAsync(cacheKey);
            logger.LogDebug("Invalidated cache for URL: {Url} with key: {CacheKey}", url, cacheKey);
        }
        catch (Exception ex)
        {
            logger.LogWarning(ex, "Failed to invalidate cache for URL: {Url}", url);
        }
    }

    /// <summary>
    /// Invalidate all cached link previews. Failures are logged and swallowed.
    /// </summary>
    public async Task InvalidateAllCachedPreviewsAsync()
    {
        try
        {
            await cache.RemoveGroupAsync(LinkPreviewCacheGroup);
            logger.LogInformation("Invalidated all cached link previews");
        }
        catch (Exception ex)
        {
            logger.LogWarning(ex, "Failed to invalidate all cached link previews");
        }
    }
}