using System.Globalization;
using AngleSharp;
using AngleSharp.Dom;
using DysonNetwork.Sphere.Storage;
using HtmlAgilityPack;

namespace DysonNetwork.Sphere.Connection.WebReader;

/// <summary>
/// This service aims to provide scraping services to the Solar Network,
/// such as news feeds, external articles and link previews.
/// </summary>
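/// <remarks>
/// Minimal usage sketch (assumes the service has been resolved from dependency
/// injection as <c>webReader</c>; the URL is purely illustrative):
/// <code>
/// var preview = await webReader.GetLinkPreviewAsync("https://example.com/some-post");
/// var title = preview.Title;          // falls back to the host name when no metadata is found
///
/// var article = await webReader.ScrapeArticleAsync("https://example.com/some-post");
/// var articleHtml = article.Content;  // null when the page has no article element or the fetch fails
/// </code>
/// </remarks>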
public class WebReaderService(
    IHttpClientFactory httpClientFactory,
    ILogger<WebReaderService> logger,
    ICacheService cache)
{
    private const string LinkPreviewCachePrefix = "scrap:preview:";
    private const string LinkPreviewCacheGroup = "scrap:preview";
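
    /// <summary>
    /// Scrapes an article from the given URL: builds the link preview embed and,
    /// when the page exposes an article element, extracts its inner HTML.
    /// </summary>
    /// <param name="url">The URL of the article to scrape</param>
    /// <param name="cancellationToken">Cancellation token</param>
    /// <returns>A ScrapedArticle containing the link embed and the optional article content</returns>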
    public async Task<ScrapedArticle> ScrapeArticleAsync(string url, CancellationToken cancellationToken = default)
    {
        var linkEmbed = await GetLinkPreviewAsync(url, cancellationToken);
        var content = await GetArticleContentAsync(url, cancellationToken);
        return new ScrapedArticle
        {
            LinkEmbed = linkEmbed,
            Content = content
        };
    }
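
    /// <summary>
    /// Fetches the page and returns the inner HTML of its first article element,
    /// or null if the request fails or no article element is present.
    /// </summary>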
    private async Task<string?> GetArticleContentAsync(string url, CancellationToken cancellationToken)
    {
        var httpClient = httpClientFactory.CreateClient("WebReader");
        var response = await httpClient.GetAsync(url, cancellationToken);
        if (!response.IsSuccessStatusCode)
        {
            logger.LogWarning("Failed to scrape article content for URL: {Url}", url);
            return null;
        }
        var html = await response.Content.ReadAsStringAsync(cancellationToken);
        var doc = new HtmlDocument();
        doc.LoadHtml(html);
        var articleNode = doc.DocumentNode.SelectSingleNode("//article");
        return articleNode?.InnerHtml;
    }

    /// <summary>
    /// Generate a link preview embed from a URL
    /// </summary>
    /// <param name="url">The URL to generate the preview for</param>
    /// <param name="cancellationToken">Cancellation token</param>
    /// <param name="cacheExpiry">Custom cache expiration time</param>
    /// <param name="bypassCache">If true, bypass cache and fetch fresh data</param>
    /// <returns>A LinkEmbed object containing the preview data</returns>
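    /// <example>
    /// Bypassing the cache with a custom expiry (assumes an injected <c>webReader</c>
    /// instance; the URL is illustrative):
    /// <code>
    /// var embed = await webReader.GetLinkPreviewAsync(
    ///     "https://example.com/some-post",
    ///     cacheExpiry: TimeSpan.FromHours(1),
    ///     bypassCache: true);
    /// </code>
    /// </example>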
    public async Task<LinkEmbed> GetLinkPreviewAsync(
        string url,
        CancellationToken cancellationToken = default,
        TimeSpan? cacheExpiry = null,
        bool bypassCache = false
    )
    {
        // Ensure URL is valid
        if (!Uri.TryCreate(url, UriKind.Absolute, out var uri))
        {
            throw new ArgumentException(@"Invalid URL format", nameof(url));
        }

        // Try to get from cache if not bypassing
        if (!bypassCache)
        {
            var cachedPreview = await GetCachedLinkPreview(url);
            if (cachedPreview is not null)
                return cachedPreview;
        }

        // Cache miss or bypass, fetch fresh data
        logger.LogDebug("Fetching fresh link preview for URL: {Url}", url);
        var httpClient = httpClientFactory.CreateClient("WebReader");
        httpClient.MaxResponseContentBufferSize =
            10 * 1024 * 1024; // 10 MB, prevents scraping some large, directly accessible files
        httpClient.Timeout = TimeSpan.FromSeconds(3);
        // Set the User-Agent to Facebook's crawler so sites serve their OpenGraph metadata.
        httpClient.DefaultRequestHeaders.Add("User-Agent", "facebookexternalhit/1.1");

        try
        {
            var response = await httpClient.GetAsync(url, cancellationToken);
            response.EnsureSuccessStatusCode();

            var contentType = response.Content.Headers.ContentType?.MediaType;
            if (contentType == null || !contentType.StartsWith("text/html"))
            {
                logger.LogWarning("URL is not an HTML page: {Url}, ContentType: {ContentType}", url, contentType);
                var nonHtmlEmbed = new LinkEmbed
                {
                    Url = url,
                    Title = uri.Host,
                    ContentType = contentType
                };

                // Cache non-HTML responses too
                await CacheLinkPreview(nonHtmlEmbed, url, cacheExpiry);
                return nonHtmlEmbed;
            }

            var html = await response.Content.ReadAsStringAsync(cancellationToken);
            var linkEmbed = await ExtractLinkData(url, html, uri);

            // Cache the result
            await CacheLinkPreview(linkEmbed, url, cacheExpiry);

            return linkEmbed;
        }
        catch (HttpRequestException ex)
        {
            logger.LogError(ex, "Failed to fetch URL: {Url}", url);
            throw new WebReaderException($"Failed to fetch URL: {url}", ex);
        }
    }
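
    /// <summary>
    /// Builds a LinkEmbed from the fetched HTML, preferring OpenGraph tags and falling
    /// back to Twitter card tags, standard meta tags, and finally the page title or host.
    /// </summary>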
    private async Task<LinkEmbed> ExtractLinkData(string url, string html, Uri uri)
    {
        var embed = new LinkEmbed
        {
            Url = url
        };

        // Configure AngleSharp context
        var config = Configuration.Default;
        var context = BrowsingContext.New(config);
        var document = await context.OpenAsync(req => req.Content(html));

        // Extract OpenGraph tags
        var ogTitle = GetMetaTagContent(document, "og:title");
        var ogDescription = GetMetaTagContent(document, "og:description");
        var ogImage = GetMetaTagContent(document, "og:image");
        var ogSiteName = GetMetaTagContent(document, "og:site_name");
        var ogType = GetMetaTagContent(document, "og:type");

        // Extract Twitter card tags as fallback
        var twitterTitle = GetMetaTagContent(document, "twitter:title");
        var twitterDescription = GetMetaTagContent(document, "twitter:description");
        var twitterImage = GetMetaTagContent(document, "twitter:image");

        // Extract standard meta tags as final fallback
        var metaTitle = GetMetaTagContent(document, "title") ??
                        GetMetaContent(document, "title");
        var metaDescription = GetMetaTagContent(document, "description");

        // Extract page title
        var pageTitle = document.Title?.Trim();

        // Extract publish date
        var publishedTime = GetMetaTagContent(document, "article:published_time") ??
                            GetMetaTagContent(document, "datePublished") ??
                            GetMetaTagContent(document, "pubdate");

        // Extract author
        var author = GetMetaTagContent(document, "author") ??
                     GetMetaTagContent(document, "article:author");

        // Extract favicon
        var faviconUrl = GetFaviconUrl(document, uri);

        // Populate the embed with the data, prioritizing OpenGraph
        embed.Title = ogTitle ?? twitterTitle ?? metaTitle ?? pageTitle ?? uri.Host;
        embed.Description = ogDescription ?? twitterDescription ?? metaDescription;
        embed.ImageUrl = ResolveRelativeUrl(ogImage ?? twitterImage, uri);
        embed.SiteName = ogSiteName ?? uri.Host;
        embed.ContentType = ogType;
        embed.FaviconUrl = faviconUrl;
        embed.Author = author;

        // Parse and set published date
        if (!string.IsNullOrEmpty(publishedTime) &&
            DateTime.TryParse(publishedTime, CultureInfo.InvariantCulture, DateTimeStyles.AdjustToUniversal,
                out DateTime parsedDate))
        {
            embed.PublishedDate = parsedDate;
        }

        return embed;
    }
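
    /// <summary>
    /// Reads the content attribute of a meta tag matched by either its property or name attribute.
    /// </summary>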
    private static string? GetMetaTagContent(IDocument doc, string property)
    {
        // Check for OpenGraph/Twitter style meta tags
        var node = doc.QuerySelector($"meta[property='{property}'][content]")
                   ?? doc.QuerySelector($"meta[name='{property}'][content]");

        return node?.GetAttribute("content")?.Trim();
    }
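
    /// <summary>
    /// Reads the content attribute of a standard meta tag matched by its name attribute.
    /// </summary>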
    private static string? GetMetaContent(IDocument doc, string name)
    {
        var node = doc.QuerySelector($"meta[name='{name}'][content]");
        return node?.GetAttribute("content")?.Trim();
    }
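
    /// <summary>
    /// Resolves the page's favicon URL, preferring the apple-touch-icon, then standard
    /// icon links, and finally falling back to /favicon.ico on the site root.
    /// </summary>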
    private static string? GetFaviconUrl(IDocument doc, Uri baseUri)
    {
        // Look for apple-touch-icon first as it's typically higher quality
        var appleIconNode = doc.QuerySelector("link[rel='apple-touch-icon'][href]");
        if (appleIconNode != null)
        {
            return ResolveRelativeUrl(appleIconNode.GetAttribute("href"), baseUri);
        }

        // Then check for standard favicon
        var faviconNode = doc.QuerySelector("link[rel='icon'][href]") ??
                          doc.QuerySelector("link[rel='shortcut icon'][href]");

        return faviconNode != null
            ? ResolveRelativeUrl(faviconNode.GetAttribute("href"), baseUri)
            : new Uri(baseUri, "/favicon.ico").ToString();
    }
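
    /// <summary>
    /// Resolves a possibly relative URL against the page's base URI; returns null when
    /// the input is empty or cannot be resolved.
    /// </summary>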
    private static string? ResolveRelativeUrl(string? url, Uri baseUri)
    {
        if (string.IsNullOrEmpty(url))
        {
            return null;
        }

        if (Uri.TryCreate(url, UriKind.Absolute, out _))
        {
            return url; // Already absolute
        }

        return Uri.TryCreate(baseUri, url, out var absoluteUri) ? absoluteUri.ToString() : null;
    }

    /// <summary>
    /// Generate a hash-based cache key for a URL
    /// </summary>
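    /// <remarks>
    /// The key is the LinkPreviewCachePrefix followed by the lowercase hex SHA-256 of the
    /// normalized URL, e.g. "scrap:preview:&lt;64 hex chars&gt;".
    /// </remarks>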
    private string GenerateUrlCacheKey(string url)
    {
        // Normalize the URL first
        var normalizedUrl = NormalizeUrl(url);

        // Create SHA256 hash of the normalized URL
        using var sha256 = System.Security.Cryptography.SHA256.Create();
        var urlBytes = System.Text.Encoding.UTF8.GetBytes(normalizedUrl);
        var hashBytes = sha256.ComputeHash(urlBytes);

        // Convert to hex string
        var hashString = BitConverter.ToString(hashBytes).Replace("-", "").ToLowerInvariant();

        // Return prefixed key
        return $"{LinkPreviewCachePrefix}{hashString}";
    }

    /// <summary>
    /// Normalize URL by trimming trailing slashes but preserving query parameters;
    /// the result is lower-cased.
    /// </summary>
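    /// <remarks>
    /// For example, "HTTPS://Example.com/Path/?Q=1" normalizes to "https://example.com/path?q=1".
    /// </remarks>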
    private string NormalizeUrl(string url)
    {
        if (string.IsNullOrEmpty(url))
            return string.Empty;

        // First ensure we have a valid URI
        if (!Uri.TryCreate(url, UriKind.Absolute, out var uri))
            return url.TrimEnd('/');

        // Rebuild the URL without trailing slashes but with query parameters
        var scheme = uri.Scheme;
        var host = uri.Host;
        var port = uri.IsDefaultPort ? string.Empty : $":{uri.Port}";
        var path = uri.AbsolutePath.TrimEnd('/');
        var query = uri.Query;

        return $"{scheme}://{host}{port}{path}{query}".ToLowerInvariant();
    }

    /// <summary>
    /// Cache a link preview
    /// </summary>
    private async Task CacheLinkPreview(LinkEmbed? linkEmbed, string url, TimeSpan? expiry = null)
    {
        if (linkEmbed == null || string.IsNullOrEmpty(url))
            return;

        try
        {
            var cacheKey = GenerateUrlCacheKey(url);
            var expiryTime = expiry ?? TimeSpan.FromHours(24);

            await cache.SetWithGroupsAsync(
                cacheKey,
                linkEmbed,
                [LinkPreviewCacheGroup],
                expiryTime);

            logger.LogDebug("Cached link preview for URL: {Url} with key: {CacheKey}", url, cacheKey);
        }
        catch (Exception ex)
        {
            // Log but don't throw - caching failures shouldn't break the main functionality
            logger.LogWarning(ex, "Failed to cache link preview for URL: {Url}", url);
        }
    }

    /// <summary>
    /// Try to get a cached link preview
    /// </summary>
    private async Task<LinkEmbed?> GetCachedLinkPreview(string url)
    {
        if (string.IsNullOrEmpty(url))
            return null;

        try
        {
            var cacheKey = GenerateUrlCacheKey(url);
            var cachedPreview = await cache.GetAsync<LinkEmbed>(cacheKey);

            if (cachedPreview is not null)
                logger.LogDebug("Retrieved cached link preview for URL: {Url}", url);

            return cachedPreview;
        }
        catch (Exception ex)
        {
            logger.LogWarning(ex, "Failed to retrieve cached link preview for URL: {Url}", url);
            return null;
        }
    }

    /// <summary>
    /// Invalidate cache for a specific URL
    /// </summary>
    public async Task InvalidateCacheForUrlAsync(string url)
    {
        if (string.IsNullOrEmpty(url))
            return;

        try
        {
            var cacheKey = GenerateUrlCacheKey(url);
            await cache.RemoveAsync(cacheKey);
            logger.LogDebug("Invalidated cache for URL: {Url} with key: {CacheKey}", url, cacheKey);
        }
        catch (Exception ex)
        {
            logger.LogWarning(ex, "Failed to invalidate cache for URL: {Url}", url);
        }
    }

    /// <summary>
    /// Invalidate all cached link previews
    /// </summary>
    public async Task InvalidateAllCachedPreviewsAsync()
    {
        try
        {
            await cache.RemoveGroupAsync(LinkPreviewCacheGroup);
            logger.LogInformation("Invalidated all cached link previews");
        }
        catch (Exception ex)
        {
            logger.LogWarning(ex, "Failed to invalidate all cached link previews");
        }
    }
}