👔 Fetch fewer pages when not in eager mode
@@ -119,7 +119,7 @@ func newsSourceReadWordpress(src models.NewsSource, eager ...bool) ([]models.New
 	return result, nil
 }
 
-func newsSourceReadFeed(src models.NewsSource) ([]models.NewsArticle, error) {
+func newsSourceReadFeed(src models.NewsSource, eager ...bool) ([]models.NewsArticle, error) {
 	pgConvert := func(article models.NewsArticle) models.NewsArticle {
 		art := &article
 		art.GenHash()
@@ -133,8 +133,15 @@ func newsSourceReadFeed(src models.NewsSource) ([]models.NewsArticle, error) {
 	fp := gofeed.NewParser()
 	feed, _ := fp.ParseURLWithContext(src.Source, ctx)
 
+	maxPages := lo.Ternary(len(eager) > 0 && eager[0], len(feed.Items), src.Depth)
+
 	var result []models.NewsArticle
 	for _, item := range feed.Items {
+		if maxPages <= 0 {
+			break
+		}
+
+		maxPages--
 		parent := models.NewsArticle{
 			URL:         item.Link,
 			Title:       item.Title,
@@ -160,7 +167,7 @@ func newsSourceReadFeed(src models.NewsSource) ([]models.NewsArticle, error) {
 	return result, nil
 }
 
-func newsSourceReadScrap(src models.NewsSource) ([]models.NewsArticle, error) {
+func newsSourceReadScrap(src models.NewsSource, eager ...bool) ([]models.NewsArticle, error) {
	pgConvert := func(article models.NewsArticle) models.NewsArticle {
 		art := &article
 		art.GenHash()
@@ -169,7 +176,8 @@ func newsSourceReadScrap(src models.NewsSource) ([]models.NewsArticle, error) {
 		return article
 	}
 
-	result := ScrapNewsIndex(src.Source)
+	maxPages := lo.Ternary(len(eager) > 0 && eager[0], 0, src.Depth)
+	result := ScrapNewsIndex(src.Source, maxPages)
 
 	for idx, page := range result {
 		result[idx] = pgConvert(page)
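For illustration, here is a runnable sketch of the variadic-flag pattern the two signatures above adopt. It uses `lo.Ternary` from github.com/samber/lo, which the patch itself already uses; the `NewsSource` struct, `readFeed` helper, and string items are simplified stand-ins, not the project's actual code.

```go
package main

import (
	"fmt"

	"github.com/samber/lo"
)

// NewsSource is a stand-in for models.NewsSource with just the field
// the depth logic needs.
type NewsSource struct {
	Depth int
}

// readFeed mirrors the pattern above: the variadic bool acts as an
// optional flag, and eager mode lifts the cap to the full item count.
// Note that lo.Ternary evaluates both branches eagerly, which is fine
// for cheap expressions like these.
func readFeed(src NewsSource, items []string, eager ...bool) []string {
	maxPages := lo.Ternary(len(eager) > 0 && eager[0], len(items), src.Depth)

	var result []string
	for _, item := range items {
		if maxPages <= 0 {
			break
		}
		maxPages--
		result = append(result, item)
	}
	return result
}

func main() {
	src := NewsSource{Depth: 2}
	items := []string{"a", "b", "c", "d"}

	fmt.Println(readFeed(src, items))       // [a b]     — capped at Depth
	fmt.Println(readFeed(src, items, true)) // [a b c d] — eager reads everything
}
```

Making `eager` variadic keeps every existing call site compiling while letting new callers opt in with a single extra argument.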
@@ -128,7 +128,7 @@ func ScrapLink(target string) (*models.LinkMeta, error) {
 
 const ScrapNewsDefaultUA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1.1 Safari/605.1.15"
 
-func ScrapNewsIndex(target string) []models.NewsArticle {
+func ScrapNewsIndex(target string, maxPages ...int) []models.NewsArticle {
 	parsedTarget, err := url.Parse(target)
 	if err != nil {
 		return nil
@@ -140,6 +140,11 @@ func ScrapNewsIndex(target string) []models.NewsArticle {
 		ua = ScrapNewsDefaultUA
 	}
 
+	limit := -1 // stays negative (no limit) unless a positive maxPages is given
+	if len(maxPages) > 0 && maxPages[0] > 0 {
+		limit = maxPages[0]
+	}
+
 	c := colly.NewCollector(
 		colly.UserAgent(ua),
 		colly.MaxDepth(3),
@@ -160,6 +165,10 @@ func ScrapNewsIndex(target string) []models.NewsArticle {
 	var result []models.NewsArticle
 
 	c.OnHTML("main a", func(e *colly.HTMLElement) {
+		if limit == 0 {
+			return
+		}
+
 		url := e.Attr("href")
 		if strings.HasPrefix(url, "#") || strings.HasPrefix(url, "javascript:") || strings.HasPrefix(url, "mailto:") {
 			return
@@ -168,6 +177,9 @@ func ScrapNewsIndex(target string) []models.NewsArticle {
 			url = fmt.Sprintf("%s%s", baseUrl, url)
 		}
 
+		if limit > 0 {
+			limit--
+		}
 		article, err := ScrapNews(url)
 		if err != nil {
 			log.Warn().Err(err).Str("url", url).Msg("Failed to scrap a news article...")
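colly runs an `OnHTML` callback once for every element the selector matches and, as far as I know, offers no built-in cap on that count, so the budget has to live in a closure variable that each invocation checks and decrements. Below is a minimal sketch of that pattern under the same sentinel convention as above; `collectLinks` and its hard-coded selector are hypothetical stand-ins, not the project's real code.

```go
package main

import (
	"fmt"

	"github.com/gocolly/colly/v2"
)

// collectLinks is a trimmed-down analogue of ScrapNewsIndex: it gathers
// hrefs under "main a" and stops once the budget runs out. A non-positive
// maxLinks leaves the counter negative, which here means "no limit".
func collectLinks(target string, maxLinks int) []string {
	limit := -1
	if maxLinks > 0 {
		limit = maxLinks
	}

	c := colly.NewCollector()

	var links []string
	c.OnHTML("main a", func(e *colly.HTMLElement) {
		if limit == 0 {
			return // budget exhausted; ignore the remaining anchors
		}
		if limit > 0 {
			limit--
		}
		links = append(links, e.Attr("href"))
	})

	if err := c.Visit(target); err != nil {
		fmt.Println("visit failed:", err)
	}
	return links
}

func main() {
	fmt.Println(collectLinks("https://example.com", 5))
}
```

Since the collector is not created with `colly.Async(true)`, callbacks execute sequentially during `Visit`, so the plain `int` counter needs no extra synchronization.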