Compare commits

..

2 Commits

3 changed files with 61 additions and 8 deletions

View File

@@ -2,9 +2,14 @@ package api
import ( import (
"git.solsynth.dev/hypernet/nexus/pkg/nex/sec" "git.solsynth.dev/hypernet/nexus/pkg/nex/sec"
"git.solsynth.dev/hypernet/reader/pkg/internal/database"
"git.solsynth.dev/hypernet/reader/pkg/internal/models"
"git.solsynth.dev/hypernet/reader/pkg/internal/server/exts" "git.solsynth.dev/hypernet/reader/pkg/internal/server/exts"
"git.solsynth.dev/hypernet/reader/pkg/internal/services" "git.solsynth.dev/hypernet/reader/pkg/internal/services"
"github.com/gofiber/fiber/v2" "github.com/gofiber/fiber/v2"
"github.com/rs/zerolog/log"
"github.com/samber/lo"
"gorm.io/gorm/clause"
) )
func adminTriggerScanTask(c *fiber.Ctx) error { func adminTriggerScanTask(c *fiber.Ctx) error {
@@ -13,13 +18,43 @@ func adminTriggerScanTask(c *fiber.Ctx) error {
} }
var data struct { var data struct {
Eager bool `json:"eager"` Eager bool `json:"eager"`
Sources []string `json:"sources"`
} }
if err := exts.BindAndValidate(c, &data); err != nil { if err := exts.BindAndValidate(c, &data); err != nil {
return err return err
} }
go services.ScanNewsSources(data.Eager) go func() {
count := 0
for _, src := range services.NewsSources {
if !src.Enabled {
continue
}
if len(data.Sources) > 0 && !lo.Contains(data.Sources, src.ID) {
continue
}
log.Debug().Str("source", src.ID).Msg("Scanning news source...")
result, err := services.NewsSourceRead(src, data.Eager)
if err != nil {
log.Warn().Err(err).Str("source", src.ID).Msg("Failed to scan a news source.")
}
result = lo.UniqBy(result, func(item models.NewsArticle) string {
return item.Hash
})
database.C.Clauses(clause.OnConflict{
UpdateAll: true,
}).Create(&result)
log.Info().Str("source", src.ID).Int("count", len(result)).Msg("Scanned a news source.")
count += len(result)
}
log.Info().Int("count", count).Msg("Scanned all news sources.")
}()
return c.SendStatus(fiber.StatusOK) return c.SendStatus(fiber.StatusOK)
} }

View File

@@ -62,9 +62,9 @@ func NewsSourceRead(src models.NewsSource, eager ...bool) ([]models.NewsArticle,
case "wordpress": case "wordpress":
return newsSourceReadWordpress(src, eager...) return newsSourceReadWordpress(src, eager...)
case "scrap": case "scrap":
return newsSourceReadScrap(src) return newsSourceReadScrap(src, eager...)
case "feed": case "feed":
return newsSourceReadFeed(src) return newsSourceReadFeed(src, eager...)
default: default:
return nil, fmt.Errorf("unsupported news source type: %s", src.Type) return nil, fmt.Errorf("unsupported news source type: %s", src.Type)
} }
@@ -119,7 +119,7 @@ func newsSourceReadWordpress(src models.NewsSource, eager ...bool) ([]models.New
return result, nil return result, nil
} }
func newsSourceReadFeed(src models.NewsSource) ([]models.NewsArticle, error) { func newsSourceReadFeed(src models.NewsSource, eager ...bool) ([]models.NewsArticle, error) {
pgConvert := func(article models.NewsArticle) models.NewsArticle { pgConvert := func(article models.NewsArticle) models.NewsArticle {
art := &article art := &article
art.GenHash() art.GenHash()
@@ -133,8 +133,15 @@ func newsSourceReadFeed(src models.NewsSource) ([]models.NewsArticle, error) {
fp := gofeed.NewParser() fp := gofeed.NewParser()
feed, _ := fp.ParseURLWithContext(src.Source, ctx) feed, _ := fp.ParseURLWithContext(src.Source, ctx)
maxPages := lo.Ternary(len(eager) > 0 && eager[0], len(feed.Items), src.Depth)
var result []models.NewsArticle var result []models.NewsArticle
for _, item := range feed.Items { for _, item := range feed.Items {
if maxPages <= 0 {
break
}
maxPages--
parent := models.NewsArticle{ parent := models.NewsArticle{
URL: item.Link, URL: item.Link,
Title: item.Title, Title: item.Title,
@@ -160,7 +167,7 @@ func newsSourceReadFeed(src models.NewsSource) ([]models.NewsArticle, error) {
return result, nil return result, nil
} }
func newsSourceReadScrap(src models.NewsSource) ([]models.NewsArticle, error) { func newsSourceReadScrap(src models.NewsSource, eager ...bool) ([]models.NewsArticle, error) {
pgConvert := func(article models.NewsArticle) models.NewsArticle { pgConvert := func(article models.NewsArticle) models.NewsArticle {
art := &article art := &article
art.GenHash() art.GenHash()
@@ -169,7 +176,8 @@ func newsSourceReadScrap(src models.NewsSource) ([]models.NewsArticle, error) {
return article return article
} }
result := ScrapNewsIndex(src.Source) maxPages := lo.Ternary(len(eager) > 0 && eager[0], 0, src.Depth)
result := ScrapNewsIndex(src.Source, maxPages)
for idx, page := range result { for idx, page := range result {
result[idx] = pgConvert(page) result[idx] = pgConvert(page)

View File

@@ -128,7 +128,7 @@ func ScrapLink(target string) (*models.LinkMeta, error) {
const ScrapNewsDefaultUA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1.1 Safari/605.1.15" const ScrapNewsDefaultUA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1.1 Safari/605.1.15"
func ScrapNewsIndex(target string) []models.NewsArticle { func ScrapNewsIndex(target string, maxPages ...int) []models.NewsArticle {
parsedTarget, err := url.Parse(target) parsedTarget, err := url.Parse(target)
if err != nil { if err != nil {
return nil return nil
@@ -140,6 +140,11 @@ func ScrapNewsIndex(target string) []models.NewsArticle {
ua = ScrapNewsDefaultUA ua = ScrapNewsDefaultUA
} }
var limit int
if len(maxPages) > 0 && maxPages[0] > 0 {
limit = maxPages[0]
}
c := colly.NewCollector( c := colly.NewCollector(
colly.UserAgent(ua), colly.UserAgent(ua),
colly.MaxDepth(3), colly.MaxDepth(3),
@@ -160,6 +165,10 @@ func ScrapNewsIndex(target string) []models.NewsArticle {
var result []models.NewsArticle var result []models.NewsArticle
c.OnHTML("main a", func(e *colly.HTMLElement) { c.OnHTML("main a", func(e *colly.HTMLElement) {
if limit <= 0 {
return
}
url := e.Attr("href") url := e.Attr("href")
if strings.HasPrefix(url, "#") || strings.HasPrefix(url, "javascript:") || strings.HasPrefix(url, "mailto:") { if strings.HasPrefix(url, "#") || strings.HasPrefix(url, "javascript:") || strings.HasPrefix(url, "mailto:") {
return return
@@ -168,6 +177,7 @@ func ScrapNewsIndex(target string) []models.NewsArticle {
url = fmt.Sprintf("%s%s", baseUrl, url) url = fmt.Sprintf("%s%s", baseUrl, url)
} }
limit--
article, err := ScrapNews(url) article, err := ScrapNews(url)
if err != nil { if err != nil {
log.Warn().Err(err).Str("url", url).Msg("Failed to scrap a news article...") log.Warn().Err(err).Str("url", url).Msg("Failed to scrap a news article...")