Bug fixes on web feed & scraping

This commit is contained in:
LittleSheep 2025-06-30 23:26:05 +08:00
parent ca5be5a01c
commit ccb8a4e3f4
4 changed files with 16 additions and 6 deletions

View File

@@ -31,8 +31,8 @@ public class WebFeedConfig
 public class WebFeed : ModelBase
 {
 public Guid Id { get; set; } = Guid.NewGuid();
-[MaxLength(8192)] public string Url { get; set; }
-[MaxLength(4096)] public string Title { get; set; }
+[MaxLength(8192)] public string Url { get; set; } = null!;
+[MaxLength(4096)] public string Title { get; set; } = null!;
 [MaxLength(8192)] public string? Description { get; set; }
 [Column(TypeName = "jsonb")] public LinkEmbed? Preview { get; set; }

View File

@@ -13,7 +13,8 @@ public class WebFeedController(WebFeedService webFeed, PublisherService ps) : Co
 public record WebFeedRequest(
 [MaxLength(8192)] string? Url,
 [MaxLength(4096)] string? Title,
-[MaxLength(8192)] string? Description
+[MaxLength(8192)] string? Description,
+WebFeedConfig? Config
 );
 [HttpGet]

View File

@@ -11,13 +11,15 @@ public class WebFeedService(
 WebReaderService webReaderService
 )
 {
-public async Task<WebFeed> CreateWebFeedAsync(Publisher.Publisher publisher, WebFeedController.WebFeedRequest request)
+public async Task<WebFeed> CreateWebFeedAsync(Publisher.Publisher publisher,
+WebFeedController.WebFeedRequest request)
 {
 var feed = new WebFeed
 {
 Url = request.Url!,
 Title = request.Title!,
 Description = request.Description,
+Config = request.Config ?? new WebFeedConfig(),
 PublisherId = publisher.Id,
 };
@@ -48,6 +50,8 @@ public class WebFeedService(
 feed.Title = request.Title;
 if (request.Description is not null)
 feed.Description = request.Description;
+if (request.Config is not null)
+feed.Config = request.Config;
 database.Update(feed);
 await database.SaveChangesAsync();

View File

@@ -33,7 +33,11 @@ public class WebReaderService(
 {
 var httpClient = httpClientFactory.CreateClient("WebReader");
 var response = await httpClient.GetAsync(url, cancellationToken);
-response.EnsureSuccessStatusCode();
+if (!response.IsSuccessStatusCode)
+{
+logger.LogWarning("Failed to scrap article content for URL: {Url}", url);
+return null;
+}
 var html = await response.Content.ReadAsStringAsync(cancellationToken);
 var doc = new HtmlDocument();
 doc.LoadHtml(html);
@@ -74,7 +78,8 @@ public class WebReaderService(
 // Cache miss or bypass, fetch fresh data
 logger.LogDebug("Fetching fresh link preview for URL: {Url}", url);
 var httpClient = httpClientFactory.CreateClient("WebReader");
-httpClient.MaxResponseContentBufferSize = 10 * 1024 * 1024; // 10MB, prevent scrap some directly accessible files
+httpClient.MaxResponseContentBufferSize =
+10 * 1024 * 1024; // 10MB, prevent scrap some directly accessible files
 httpClient.Timeout = TimeSpan.FromSeconds(3);
 // Setting UA to facebook's bot to get the opengraph.
 httpClient.DefaultRequestHeaders.Add("User-Agent", "facebookexternalhit/1.1");