367 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			C#
		
	
	
	
	
	
			
		
		
	
	
			367 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			C#
		
	
	
	
	
	
using System.Globalization;
 | 
						|
using AngleSharp;
 | 
						|
using AngleSharp.Dom;
 | 
						|
using DysonNetwork.Shared.Cache;
 | 
						|
using HtmlAgilityPack;
 | 
						|
 | 
						|
namespace DysonNetwork.Sphere.WebReader;
 | 
						|
 | 
						|
/// <summary>
 | 
						|
/// This service aims to provide scraping services to the Solar Network,
 | 
						|
/// such as news feeds, external articles, and link previews.
 | 
						|
/// </summary>
 | 
						|
public class WebReaderService(
 | 
						|
    IHttpClientFactory httpClientFactory,
 | 
						|
    ILogger<WebReaderService> logger,
 | 
						|
    ICacheService cache
 | 
						|
)
 | 
						|
{
 | 
						|
    private const string LinkPreviewCachePrefix = "scrap:preview:";
 | 
						|
    private const string LinkPreviewCacheGroup = "scrap:preview";
 | 
						|
 | 
						|
    /// <summary>
    /// Scrape a single article: resolve the link preview for <paramref name="url"/> and then
    /// pull the article markup from the same URL.
    /// </summary>
    /// <param name="url">The URL of the article to scrape</param>
    /// <param name="cancellationToken">Cancellation token forwarded to both lookups</param>
    /// <returns>A ScrapedArticle carrying the link preview and the article content (which may be null)</returns>
    public async Task<ScrapedArticle> ScrapeArticleAsync(string url, CancellationToken cancellationToken = default)
    {
        // The preview is resolved first; the article body is only requested once that has completed.
        var preview = await GetLinkPreviewAsync(url, cancellationToken);
        var articleHtml = await GetArticleContentAsync(url, cancellationToken);

        var scraped = new ScrapedArticle { LinkEmbed = preview, Content = articleHtml };
        return scraped;
    }
 | 
						|
 | 
						|
    /// <summary>
    /// Download <paramref name="url"/> with the "WebReader" HTTP client and return the inner HTML
    /// of the first <c>article</c> element found in the page.
    /// </summary>
    /// <param name="url">The URL of the page to download</param>
    /// <param name="cancellationToken">Cancellation token for the request and the body read</param>
    /// <returns>
    /// The inner HTML of the first <c>article</c> element, or null when the response status is
    /// not successful or the page contains no such element.
    /// </returns>
    private async Task<string?> GetArticleContentAsync(string url, CancellationToken cancellationToken)
    {
        var httpClient = httpClientFactory.CreateClient("WebReader");

        // Dispose the response so its content and underlying resources are released promptly.
        using var response = await httpClient.GetAsync(url, cancellationToken);
        if (!response.IsSuccessStatusCode)
        {
            logger.LogWarning("Failed to scrap article content for URL: {Url}", url);
            return null;
        }

        var html = await response.Content.ReadAsStringAsync(cancellationToken);

        var doc = new HtmlDocument();
        doc.LoadHtml(html);

        // "//article" selects the first <article> element anywhere in the document.
        var articleNode = doc.DocumentNode.SelectSingleNode("//article");
        return articleNode?.InnerHtml;
    }
 | 
						|
 | 
						|
 | 
						|
    /// <summary>
    /// Generate a link preview embed from a URL.
    /// </summary>
    /// <remarks>
    /// Unless <paramref name="bypassCache"/> is set, a cached preview is returned when one exists.
    /// Otherwise the URL is fetched with the "WebReader" HTTP client and the resulting preview
    /// (for HTML and non-HTML responses alike) is written to the cache before it is returned.
    /// </remarks>
    /// <param name="url">The absolute URL to generate the preview for</param>
    /// <param name="cancellationToken">Cancellation token</param>
    /// <param name="cacheExpiry">Custom cache expiration time; when null, CacheLinkPreview applies its own default</param>
    /// <param name="bypassCache">If true, bypass cache and fetch fresh data</param>
    /// <returns>A LinkEmbed object containing the preview data</returns>
    /// <exception cref="ArgumentException">Thrown when <paramref name="url"/> is not a valid absolute URI</exception>
    /// <exception cref="WebReaderException">Thrown when the HTTP request fails with an <see cref="HttpRequestException"/></exception>
    public async Task<LinkEmbed> GetLinkPreviewAsync(
        string url,
        CancellationToken cancellationToken = default,
        TimeSpan? cacheExpiry = null,
        bool bypassCache = false
    )
    {
        // Ensure URL is valid
        if (!Uri.TryCreate(url, UriKind.Absolute, out var uri))
        {
            throw new ArgumentException(@"Invalid URL format", nameof(url));
        }

        // Try to get from cache if not bypassing
        if (!bypassCache)
        {
            var cachedPreview = await GetCachedLinkPreview(url);
            if (cachedPreview is not null)
                return cachedPreview;
        }

        // Cache miss or bypass, fetch fresh data
        logger.LogDebug("Fetching fresh link preview for URL: {Url}", url);
        var httpClient = httpClientFactory.CreateClient("WebReader");

        // Cap the buffered body at 10 MB so that links pointing straight at large files are not scraped.
        httpClient.MaxResponseContentBufferSize = 10 * 1024 * 1024;
        httpClient.Timeout = TimeSpan.FromSeconds(3);

        // Identify as Facebook's crawler so that sites serve their OpenGraph metadata.
        httpClient.DefaultRequestHeaders.Add("User-Agent", "facebookexternalhit/1.1");

        // NOTE(review): when the 3-second timeout elapses HttpClient reports it as a
        // TaskCanceledException, which is not an HttpRequestException and therefore propagates
        // to the caller unwrapped - confirm callers expect that.
        try
        {
            // Dispose the response once the preview has been built so its resources are released.
            using var response = await httpClient.GetAsync(url, cancellationToken);
            response.EnsureSuccessStatusCode();

            // Media types are case-insensitive, so compare ordinally while ignoring case.
            var contentType = response.Content.Headers.ContentType?.MediaType;
            if (contentType == null || !contentType.StartsWith("text/html", StringComparison.OrdinalIgnoreCase))
            {
                logger.LogWarning("URL is not an HTML page: {Url}, ContentType: {ContentType}", url, contentType);
                var nonHtmlEmbed = new LinkEmbed
                {
                    Url = url,
                    Title = uri.Host,
                    ContentType = contentType
                };

                // Cache non-HTML responses too
                await CacheLinkPreview(nonHtmlEmbed, url, cacheExpiry);
                return nonHtmlEmbed;
            }

            var html = await response.Content.ReadAsStringAsync(cancellationToken);
            var linkEmbed = await ExtractLinkData(url, html, uri);

            // Cache the result
            await CacheLinkPreview(linkEmbed, url, cacheExpiry);

            return linkEmbed;
        }
        catch (HttpRequestException ex)
        {
            logger.LogError(ex, "Failed to fetch URL: {Url}", url);
            throw new WebReaderException($"Failed to fetch URL: {url}", ex);
        }
    }
 | 
						|
 | 
						|
    /// <summary>
    /// Parse <paramref name="html"/> and build a link preview from its metadata.
    /// OpenGraph tags take priority, then Twitter card tags, then standard meta tags and the
    /// page title; the host of <paramref name="uri"/> is the last-resort title and site name.
    /// </summary>
    /// <param name="url">The URL stored on the resulting embed</param>
    /// <param name="html">The HTML markup of the page</param>
    /// <param name="uri">The parsed page URI, used for host fallbacks and to resolve relative links</param>
    /// <returns>The populated LinkEmbed</returns>
    private async Task<LinkEmbed> ExtractLinkData(string url, string html, Uri uri)
    {
        // Parse the markup with a default AngleSharp browsing context
        var browsingContext = BrowsingContext.New(Configuration.Default);
        var page = await browsingContext.OpenAsync(request => request.Content(html));

        // Shorthand for looking up a meta tag on the parsed page
        string? Meta(string key) => GetMetaTagContent(page, key);

        // Title: OpenGraph, Twitter card, meta "title", <title>, then the host name
        var title = Meta("og:title")
                    ?? Meta("twitter:title")
                    ?? Meta("title")
                    ?? GetMetaContent(page, "title")
                    ?? page.Title?.Trim()
                    ?? uri.Host;

        // Description and image follow the same OpenGraph-first ordering
        var description = Meta("og:description")
                          ?? Meta("twitter:description")
                          ?? Meta("description");
        var image = Meta("og:image") ?? Meta("twitter:image");

        // Publication timestamp and author, first match wins
        var published = Meta("article:published_time")
                        ?? Meta("datePublished")
                        ?? Meta("pubdate");
        var writer = Meta("author") ?? Meta("article:author");

        var result = new LinkEmbed
        {
            Url = url,
            Title = title,
            Description = description,
            ImageUrl = ResolveRelativeUrl(image, uri),
            SiteName = Meta("og:site_name") ?? uri.Host,
            ContentType = Meta("og:type"),
            FaviconUrl = GetFaviconUrl(page, uri),
            Author = writer
        };

        // The published date is only set when a timestamp is present and parses successfully
        if (string.IsNullOrEmpty(published))
        {
            return result;
        }

        if (DateTime.TryParse(published, CultureInfo.InvariantCulture, DateTimeStyles.AdjustToUniversal,
                out var parsed))
        {
            result.PublishedDate = parsed;
        }

        return result;
    }
 | 
						|
 | 
						|
    /// <summary>
    /// Read the trimmed <c>content</c> of a meta tag, looking first for
    /// <c>meta[property=...]</c> (OpenGraph style) and then for <c>meta[name=...]</c>.
    /// </summary>
    /// <param name="doc">The parsed document to search</param>
    /// <param name="property">The value of the <c>property</c> / <c>name</c> attribute to match</param>
    /// <returns>
    /// The trimmed content, or null when no matching tag exists or its content is empty.
    /// Returning null for empty content lets callers' <c>??</c> fallback chains move on to the
    /// next candidate instead of stopping at an empty string.
    /// </returns>
    private static string? GetMetaTagContent(IDocument doc, string property)
    {
        // Check for OpenGraph/Twitter style meta tags
        var byProperty = doc.QuerySelector($"meta[property='{property}'][content]")
            ?.GetAttribute("content")
            ?.Trim();
        if (!string.IsNullOrEmpty(byProperty))
        {
            return byProperty;
        }

        // Fall back to the name attribute when the property lookup gave nothing usable
        var byName = doc.QuerySelector($"meta[name='{property}'][content]")
            ?.GetAttribute("content")
            ?.Trim();

        return string.IsNullOrEmpty(byName) ? null : byName;
    }
 | 
						|
 | 
						|
    /// <summary>
    /// Read the trimmed <c>content</c> of the first <c>meta</c> tag whose <c>name</c> attribute
    /// equals <paramref name="name"/>.
    /// </summary>
    /// <param name="doc">The parsed document to search</param>
    /// <param name="name">The value of the <c>name</c> attribute to match</param>
    /// <returns>The trimmed content, or null when no such tag exists</returns>
    private static string? GetMetaContent(IDocument doc, string name)
    {
        var selector = $"meta[name='{name}'][content]";
        var match = doc.QuerySelector(selector);
        if (match is null)
        {
            return null;
        }

        return match.GetAttribute("content")?.Trim();
    }
 | 
						|
 | 
						|
    /// <summary>
    /// Find the best icon URL for a page.
    /// </summary>
    /// <remarks>
    /// Candidates are tried in order: <c>apple-touch-icon</c> (typically higher quality), then
    /// <c>icon</c>, then <c>shortcut icon</c>. A candidate whose href is empty or cannot be
    /// resolved is skipped, so the conventional <c>/favicon.ico</c> fallback is still reached
    /// instead of returning null.
    /// </remarks>
    /// <param name="doc">The parsed document to search</param>
    /// <param name="baseUri">The page URI used to resolve relative hrefs and to build the fallback</param>
    /// <returns>An absolute icon URL; <c>/favicon.ico</c> on the page's origin when no usable link tag exists</returns>
    private static string? GetFaviconUrl(IDocument doc, Uri baseUri)
    {
        string[] selectors =
        [
            "link[rel='apple-touch-icon'][href]",
            "link[rel='icon'][href]",
            "link[rel='shortcut icon'][href]"
        ];

        foreach (var selector in selectors)
        {
            var href = doc.QuerySelector(selector)?.GetAttribute("href");
            var resolved = ResolveRelativeUrl(href, baseUri);
            if (resolved is not null)
            {
                return resolved;
            }
        }

        // No usable <link> tag: fall back to the conventional location
        return new Uri(baseUri, "/favicon.ico").ToString();
    }
 | 
						|
 | 
						|
    /// <summary>
    /// Turn a possibly relative URL into an absolute one using <paramref name="baseUri"/>.
    /// </summary>
    /// <param name="url">The URL taken from the page; may be null, empty, relative or absolute</param>
    /// <param name="baseUri">The URI of the page the URL was found on</param>
    /// <returns>
    /// The input unchanged when it is already an absolute non-file URL, the URL resolved against
    /// <paramref name="baseUri"/> otherwise, or null when the input is null/empty or cannot be resolved.
    /// </returns>
    private static string? ResolveRelativeUrl(string? url, Uri baseUri)
    {
        if (string.IsNullOrEmpty(url))
        {
            return null;
        }

        // On Unix, Uri.TryCreate with UriKind.Absolute accepts a root-relative path such as
        // "/img/a.png" as an implicit file:// URI. Treat file URIs as "not absolute" here so that
        // such paths are resolved against the page instead of being returned untouched.
        if (Uri.TryCreate(url, UriKind.Absolute, out var parsed) && !parsed.IsFile)
        {
            return url; // Already absolute
        }

        return Uri.TryCreate(baseUri, url, out var absoluteUri) ? absoluteUri.ToString() : null;
    }
 | 
						|
 | 
						|
    /// <summary>
    /// Build the cache key for a URL: the link preview prefix followed by the lowercase
    /// hexadecimal SHA-256 digest of the normalized URL (UTF-8 encoded).
    /// </summary>
    /// <param name="url">The URL to derive the key from</param>
    /// <returns>The prefixed cache key</returns>
    private string GenerateUrlCacheKey(string url)
    {
        // Hash the normalized form (see NormalizeUrl) rather than the raw input
        var canonical = NormalizeUrl(url);
        var digest = System.Security.Cryptography.SHA256.HashData(
            System.Text.Encoding.UTF8.GetBytes(canonical));

        // Convert.ToHexString yields uppercase digits; keys use lowercase hex
        var hex = Convert.ToHexString(digest).ToLowerInvariant();

        return LinkPreviewCachePrefix + hex;
    }
 | 
						|
 | 
						|
    /// <summary>
    /// Normalize a URL for use as a cache identity.
    /// </summary>
    /// <remarks>
    /// For an absolute URL the result is rebuilt from scheme, host, non-default port, path
    /// (without trailing slashes) and query string; other components such as the fragment are
    /// not included. Only the scheme and host are lower-cased: path and query are case-sensitive,
    /// so lower-casing them would make distinct URLs share one cache entry.
    /// </remarks>
    /// <param name="url">The URL to normalize</param>
    /// <returns>
    /// The normalized URL; an empty string for null/empty input; the input with trailing slashes
    /// trimmed when it is not a valid absolute URI.
    /// </returns>
    private static string NormalizeUrl(string url)
    {
        if (string.IsNullOrEmpty(url))
            return string.Empty;

        // First ensure we have a valid URI
        if (!Uri.TryCreate(url, UriKind.Absolute, out var uri))
            return url.TrimEnd('/');

        // Rebuild the URL without trailing slashes but with query parameters
        var scheme = uri.Scheme.ToLowerInvariant();
        var host = uri.Host.ToLowerInvariant();
        var port = uri.IsDefaultPort ? string.Empty : $":{uri.Port}";
        var path = uri.AbsolutePath.TrimEnd('/');
        var query = uri.Query;

        return $"{scheme}://{host}{port}{path}{query}";
    }
 | 
						|
 | 
						|
    /// <summary>
    /// Store a link preview in the cache under the key derived from <paramref name="url"/>,
    /// tagged with the link preview cache group.
    /// </summary>
    /// <param name="linkEmbed">The preview to store; nothing happens when it is null</param>
    /// <param name="url">The URL the preview belongs to; nothing happens when it is null or empty</param>
    /// <param name="expiry">How long the entry should live; 24 hours when not specified</param>
    private async Task CacheLinkPreview(LinkEmbed? linkEmbed, string url, TimeSpan? expiry = null)
    {
        if (linkEmbed is null || url is not { Length: > 0 })
        {
            return;
        }

        try
        {
            var key = GenerateUrlCacheKey(url);
            var lifetime = expiry.HasValue ? expiry.Value : TimeSpan.FromHours(24);

            await cache.SetWithGroupsAsync(key, linkEmbed, [LinkPreviewCacheGroup], lifetime);

            logger.LogDebug("Cached link preview for URL: {Url} with key: {CacheKey}", url, key);
        }
        catch (Exception error)
        {
            // Caching is best-effort: a failure here is logged and must not break the preview itself
            logger.LogWarning(error, "Failed to cache link preview for URL: {Url}", url);
        }
    }
 | 
						|
 | 
						|
    /// <summary>
    /// Look up a previously cached link preview for <paramref name="url"/>.
    /// </summary>
    /// <param name="url">The URL whose preview is wanted</param>
    /// <returns>
    /// The cached preview, or null when the URL is null/empty, nothing is cached, or the cache
    /// lookup throws (the failure is logged as a warning).
    /// </returns>
    private async Task<LinkEmbed?> GetCachedLinkPreview(string url)
    {
        if (url is not { Length: > 0 })
        {
            return null;
        }

        try
        {
            var hit = await cache.GetAsync<LinkEmbed>(GenerateUrlCacheKey(url));
            if (hit is null)
            {
                return hit;
            }

            logger.LogDebug("Retrieved cached link preview for URL: {Url}", url);
            return hit;
        }
        catch (Exception error)
        {
            // A cache failure is treated like a miss so the caller fetches fresh data
            logger.LogWarning(error, "Failed to retrieve cached link preview for URL: {Url}", url);
            return null;
        }
    }
 | 
						|
 | 
						|
    /// <summary>
    /// Remove the cached link preview for one URL.
    /// </summary>
    /// <remarks>
    /// Does nothing for a null or empty URL. Failures while removing the entry are logged as
    /// warnings and not rethrown.
    /// </remarks>
    /// <param name="url">The URL whose cached preview should be dropped</param>
    public async Task InvalidateCacheForUrlAsync(string url)
    {
        if (!string.IsNullOrEmpty(url))
        {
            try
            {
                var key = GenerateUrlCacheKey(url);
                await cache.RemoveAsync(key);
                logger.LogDebug("Invalidated cache for URL: {Url} with key: {CacheKey}", url, key);
            }
            catch (Exception error)
            {
                logger.LogWarning(error, "Failed to invalidate cache for URL: {Url}", url);
            }
        }
    }
 | 
						|
 | 
						|
    /// <summary>
    /// Remove every cached link preview by clearing the link preview cache group.
    /// </summary>
    /// <remarks>
    /// Failures while clearing the group are logged as warnings and not rethrown.
    /// </remarks>
    public async Task InvalidateAllCachedPreviewsAsync()
    {
        try
        {
            var removal = cache.RemoveGroupAsync(LinkPreviewCacheGroup);
            await removal;

            logger.LogInformation("Invalidated all cached link previews");
        }
        catch (Exception error)
        {
            logger.LogWarning(error, "Failed to invalidate all cached link previews");
        }
    }
 | 
						|
} |