👔 Fetch fewer pages when not in eager mode
This commit is contained in:
parent
2a58d185eb
commit
1867b34735
@ -119,7 +119,7 @@ func newsSourceReadWordpress(src models.NewsSource, eager ...bool) ([]models.New
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func newsSourceReadFeed(src models.NewsSource) ([]models.NewsArticle, error) {
|
||||
func newsSourceReadFeed(src models.NewsSource, eager ...bool) ([]models.NewsArticle, error) {
|
||||
pgConvert := func(article models.NewsArticle) models.NewsArticle {
|
||||
art := &article
|
||||
art.GenHash()
|
||||
@ -133,8 +133,15 @@ func newsSourceReadFeed(src models.NewsSource) ([]models.NewsArticle, error) {
|
||||
fp := gofeed.NewParser()
|
||||
feed, _ := fp.ParseURLWithContext(src.Source, ctx)
|
||||
|
||||
maxPages := lo.Ternary(len(eager) > 0 && eager[0], len(feed.Items), src.Depth)
|
||||
|
||||
var result []models.NewsArticle
|
||||
for _, item := range feed.Items {
|
||||
if maxPages <= 0 {
|
||||
break
|
||||
}
|
||||
|
||||
maxPages--
|
||||
parent := models.NewsArticle{
|
||||
URL: item.Link,
|
||||
Title: item.Title,
|
||||
@ -160,7 +167,7 @@ func newsSourceReadFeed(src models.NewsSource) ([]models.NewsArticle, error) {
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func newsSourceReadScrap(src models.NewsSource) ([]models.NewsArticle, error) {
|
||||
func newsSourceReadScrap(src models.NewsSource, eager ...bool) ([]models.NewsArticle, error) {
|
||||
pgConvert := func(article models.NewsArticle) models.NewsArticle {
|
||||
art := &article
|
||||
art.GenHash()
|
||||
@ -169,7 +176,8 @@ func newsSourceReadScrap(src models.NewsSource) ([]models.NewsArticle, error) {
|
||||
return article
|
||||
}
|
||||
|
||||
result := ScrapNewsIndex(src.Source)
|
||||
maxPages := lo.Ternary(len(eager) > 0 && eager[0], 0, src.Depth)
|
||||
result := ScrapNewsIndex(src.Source, maxPages)
|
||||
|
||||
for idx, page := range result {
|
||||
result[idx] = pgConvert(page)
|
||||
|
@ -128,7 +128,7 @@ func ScrapLink(target string) (*models.LinkMeta, error) {
|
||||
|
||||
const ScrapNewsDefaultUA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1.1 Safari/605.1.15"
|
||||
|
||||
func ScrapNewsIndex(target string) []models.NewsArticle {
|
||||
func ScrapNewsIndex(target string, maxPages ...int) []models.NewsArticle {
|
||||
parsedTarget, err := url.Parse(target)
|
||||
if err != nil {
|
||||
return nil
|
||||
@ -140,6 +140,11 @@ func ScrapNewsIndex(target string) []models.NewsArticle {
|
||||
ua = ScrapNewsDefaultUA
|
||||
}
|
||||
|
||||
var limit int
|
||||
if len(maxPages) > 0 && maxPages[0] > 0 {
|
||||
limit = maxPages[0]
|
||||
}
|
||||
|
||||
c := colly.NewCollector(
|
||||
colly.UserAgent(ua),
|
||||
colly.MaxDepth(3),
|
||||
@ -160,6 +165,10 @@ func ScrapNewsIndex(target string) []models.NewsArticle {
|
||||
var result []models.NewsArticle
|
||||
|
||||
c.OnHTML("main a", func(e *colly.HTMLElement) {
|
||||
if limit <= 0 {
|
||||
return
|
||||
}
|
||||
|
||||
url := e.Attr("href")
|
||||
if strings.HasPrefix(url, "#") || strings.HasPrefix(url, "javascript:") || strings.HasPrefix(url, "mailto:") {
|
||||
return
|
||||
@ -168,6 +177,7 @@ func ScrapNewsIndex(target string) []models.NewsArticle {
|
||||
url = fmt.Sprintf("%s%s", baseUrl, url)
|
||||
}
|
||||
|
||||
limit--
|
||||
article, err := ScrapNews(url)
|
||||
if err != nil {
|
||||
log.Warn().Err(err).Str("url", url).Msg("Failed to scrap a news article...")
|
||||
|
Loading…
x
Reference in New Issue
Block a user