Bug fixes on web feed & scraping

This commit is contained in:
LittleSheep 2025-06-30 23:26:05 +08:00
parent ca5be5a01c
commit ccb8a4e3f4
4 changed files with 16 additions and 6 deletions

View File

@@ -31,8 +31,8 @@ public class WebFeedConfig
public class WebFeed : ModelBase public class WebFeed : ModelBase
{ {
public Guid Id { get; set; } = Guid.NewGuid(); public Guid Id { get; set; } = Guid.NewGuid();
[MaxLength(8192)] public string Url { get; set; } [MaxLength(8192)] public string Url { get; set; } = null!;
[MaxLength(4096)] public string Title { get; set; } [MaxLength(4096)] public string Title { get; set; } = null!;
[MaxLength(8192)] public string? Description { get; set; } [MaxLength(8192)] public string? Description { get; set; }
[Column(TypeName = "jsonb")] public LinkEmbed? Preview { get; set; } [Column(TypeName = "jsonb")] public LinkEmbed? Preview { get; set; }

View File

@@ -13,7 +13,8 @@ public class WebFeedController(WebFeedService webFeed, PublisherService ps) : Co
public record WebFeedRequest( public record WebFeedRequest(
[MaxLength(8192)] string? Url, [MaxLength(8192)] string? Url,
[MaxLength(4096)] string? Title, [MaxLength(4096)] string? Title,
[MaxLength(8192)] string? Description [MaxLength(8192)] string? Description,
WebFeedConfig? Config
); );
[HttpGet] [HttpGet]

View File

@@ -11,13 +11,15 @@ public class WebFeedService(
WebReaderService webReaderService WebReaderService webReaderService
) )
{ {
public async Task<WebFeed> CreateWebFeedAsync(Publisher.Publisher publisher, WebFeedController.WebFeedRequest request) public async Task<WebFeed> CreateWebFeedAsync(Publisher.Publisher publisher,
WebFeedController.WebFeedRequest request)
{ {
var feed = new WebFeed var feed = new WebFeed
{ {
Url = request.Url!, Url = request.Url!,
Title = request.Title!, Title = request.Title!,
Description = request.Description, Description = request.Description,
Config = request.Config ?? new WebFeedConfig(),
PublisherId = publisher.Id, PublisherId = publisher.Id,
}; };
@@ -48,6 +50,8 @@ public class WebFeedService(
feed.Title = request.Title; feed.Title = request.Title;
if (request.Description is not null) if (request.Description is not null)
feed.Description = request.Description; feed.Description = request.Description;
if (request.Config is not null)
feed.Config = request.Config;
database.Update(feed); database.Update(feed);
await database.SaveChangesAsync(); await database.SaveChangesAsync();

View File

@@ -33,7 +33,11 @@ public class WebReaderService(
{ {
var httpClient = httpClientFactory.CreateClient("WebReader"); var httpClient = httpClientFactory.CreateClient("WebReader");
var response = await httpClient.GetAsync(url, cancellationToken); var response = await httpClient.GetAsync(url, cancellationToken);
response.EnsureSuccessStatusCode(); if (!response.IsSuccessStatusCode)
{
logger.LogWarning("Failed to scrap article content for URL: {Url}", url);
return null;
}
var html = await response.Content.ReadAsStringAsync(cancellationToken); var html = await response.Content.ReadAsStringAsync(cancellationToken);
var doc = new HtmlDocument(); var doc = new HtmlDocument();
doc.LoadHtml(html); doc.LoadHtml(html);
@@ -74,7 +78,8 @@ public class WebReaderService(
// Cache miss or bypass, fetch fresh data // Cache miss or bypass, fetch fresh data
logger.LogDebug("Fetching fresh link preview for URL: {Url}", url); logger.LogDebug("Fetching fresh link preview for URL: {Url}", url);
var httpClient = httpClientFactory.CreateClient("WebReader"); var httpClient = httpClientFactory.CreateClient("WebReader");
httpClient.MaxResponseContentBufferSize = 10 * 1024 * 1024; // 10MB, prevent scrap some directly accessible files httpClient.MaxResponseContentBufferSize =
10 * 1024 * 1024; // 10MB, prevent scrap some directly accessible files
httpClient.Timeout = TimeSpan.FromSeconds(3); httpClient.Timeout = TimeSpan.FromSeconds(3);
// Setting UA to facebook's bot to get the opengraph. // Setting UA to facebook's bot to get the opengraph.
httpClient.DefaultRequestHeaders.Add("User-Agent", "facebookexternalhit/1.1"); httpClient.DefaultRequestHeaders.Add("User-Agent", "facebookexternalhit/1.1");