diff --git a/pkg/internal/services/reader.go b/pkg/internal/services/reader.go
index 033874a..e952a10 100644
--- a/pkg/internal/services/reader.go
+++ b/pkg/internal/services/reader.go
@@ -119,7 +119,7 @@ func newsSourceReadWordpress(src models.NewsSource, eager ...bool) ([]models.New
 	return result, nil
 }
 
-func newsSourceReadFeed(src models.NewsSource) ([]models.NewsArticle, error) {
+func newsSourceReadFeed(src models.NewsSource, eager ...bool) ([]models.NewsArticle, error) {
 	pgConvert := func(article models.NewsArticle) models.NewsArticle {
 		art := &article
 		art.GenHash()
@@ -133,8 +133,15 @@ func newsSourceReadFeed(src models.NewsSource) ([]models.NewsArticle, error) {
 	fp := gofeed.NewParser()
 	feed, _ := fp.ParseURLWithContext(src.Source, ctx)
 
+	maxPages := lo.Ternary(len(eager) > 0 && eager[0], len(feed.Items), src.Depth) // eager: take every item; otherwise honor src.Depth
+
 	var result []models.NewsArticle
 	for _, item := range feed.Items {
+		if maxPages <= 0 {
+			break
+		}
+
+		maxPages--
 		parent := models.NewsArticle{
 			URL:   item.Link,
 			Title: item.Title,
@@ -160,7 +167,7 @@ func newsSourceReadFeed(src models.NewsSource) ([]models.NewsArticle, error) {
 	return result, nil
 }
 
-func newsSourceReadScrap(src models.NewsSource) ([]models.NewsArticle, error) {
+func newsSourceReadScrap(src models.NewsSource, eager ...bool) ([]models.NewsArticle, error) {
 	pgConvert := func(article models.NewsArticle) models.NewsArticle {
 		art := &article
 		art.GenHash()
@@ -169,7 +176,8 @@ func newsSourceReadScrap(src models.NewsSource) ([]models.NewsArticle, error) {
 		return article
 	}
 
-	result := ScrapNewsIndex(src.Source)
+	maxPages := lo.Ternary(len(eager) > 0 && eager[0], 0, src.Depth) // 0 tells the scraper there is no page limit
+	result := ScrapNewsIndex(src.Source, maxPages)
 
 	for idx, page := range result {
 		result[idx] = pgConvert(page)
diff --git a/pkg/internal/services/scraper.go b/pkg/internal/services/scraper.go
index 21b07c8..5414aa1 100644
--- a/pkg/internal/services/scraper.go
+++ b/pkg/internal/services/scraper.go
@@ -128,7 +128,7 @@ func ScrapLink(target string) (*models.LinkMeta, error) {
 
 const ScrapNewsDefaultUA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1.1 Safari/605.1.15"
 
-func ScrapNewsIndex(target string) []models.NewsArticle {
+func ScrapNewsIndex(target string, maxPages ...int) []models.NewsArticle {
 	parsedTarget, err := url.Parse(target)
 	if err != nil {
 		return nil
@@ -140,6 +140,11 @@ func ScrapNewsIndex(target string) []models.NewsArticle {
 		ua = ScrapNewsDefaultUA
 	}
 
+	limit := -1 // negative means "no limit", so bare calls and maxPages = 0 keep the old unbounded behavior
+	if len(maxPages) > 0 && maxPages[0] > 0 {
+		limit = maxPages[0]
+	}
+
 	c := colly.NewCollector(
 		colly.UserAgent(ua),
 		colly.MaxDepth(3),
@@ -160,6 +165,10 @@ func ScrapNewsIndex(target string) []models.NewsArticle {
 	var result []models.NewsArticle
 
 	c.OnHTML("main a", func(e *colly.HTMLElement) {
+		if limit == 0 {
+			return
+		}
+
 		url := e.Attr("href")
 		if strings.HasPrefix(url, "#") || strings.HasPrefix(url, "javascript:") || strings.HasPrefix(url, "mailto:") {
 			return
@@ -168,6 +177,7 @@ func ScrapNewsIndex(target string) []models.NewsArticle {
 			url = fmt.Sprintf("%s%s", baseUrl, url)
 		}
 
+		limit-- // counts down toward 0 when a limit is set; drifts further negative when unlimited
		article, err := ScrapNews(url)
 		if err != nil {
 			log.Warn().Err(err).Str("url", url).Msg("Failed to scrap a news article...")
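
For context, a minimal sketch of how the new variadic eager flag is meant to be driven from a call site. The names refreshNewsSource and backfillNewsSource are hypothetical, not part of this change; the sketch assumes it sits alongside reader.go in the services package, where models and the readers above are in scope.

// Omitting the flag (or passing false) keeps the depth-limited reads;
// passing true asks a reader to walk the whole source in one pass.

// refreshNewsSource is a hypothetical routine refresh that honors src.Depth.
func refreshNewsSource(src models.NewsSource) ([]models.NewsArticle, error) {
	return newsSourceReadFeed(src)
}

// backfillNewsSource is a hypothetical one-off backfill: it scrapes the whole
// index with no page limit (eager = true maps to maxPages = 0 above).
func backfillNewsSource(src models.NewsSource) ([]models.NewsArticle, error) {
	return newsSourceReadScrap(src, true)
}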