Compare commits
2 Commits
2a58d185eb
...
690bbd4b43
Author | SHA1 | Date | |
---|---|---|---|
690bbd4b43 | |||
1867b34735 |
@ -2,9 +2,14 @@ package api
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"git.solsynth.dev/hypernet/nexus/pkg/nex/sec"
|
"git.solsynth.dev/hypernet/nexus/pkg/nex/sec"
|
||||||
|
"git.solsynth.dev/hypernet/reader/pkg/internal/database"
|
||||||
|
"git.solsynth.dev/hypernet/reader/pkg/internal/models"
|
||||||
"git.solsynth.dev/hypernet/reader/pkg/internal/server/exts"
|
"git.solsynth.dev/hypernet/reader/pkg/internal/server/exts"
|
||||||
"git.solsynth.dev/hypernet/reader/pkg/internal/services"
|
"git.solsynth.dev/hypernet/reader/pkg/internal/services"
|
||||||
"github.com/gofiber/fiber/v2"
|
"github.com/gofiber/fiber/v2"
|
||||||
|
"github.com/rs/zerolog/log"
|
||||||
|
"github.com/samber/lo"
|
||||||
|
"gorm.io/gorm/clause"
|
||||||
)
|
)
|
||||||
|
|
||||||
func adminTriggerScanTask(c *fiber.Ctx) error {
|
func adminTriggerScanTask(c *fiber.Ctx) error {
|
||||||
@ -13,13 +18,43 @@ func adminTriggerScanTask(c *fiber.Ctx) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var data struct {
|
var data struct {
|
||||||
Eager bool `json:"eager"`
|
Eager bool `json:"eager"`
|
||||||
|
Sources []string `json:"sources"`
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := exts.BindAndValidate(c, &data); err != nil {
|
if err := exts.BindAndValidate(c, &data); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
go services.ScanNewsSources(data.Eager)
|
go func() {
|
||||||
|
count := 0
|
||||||
|
for _, src := range services.NewsSources {
|
||||||
|
if !src.Enabled {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if len(data.Sources) > 0 && !lo.Contains(data.Sources, src.ID) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Debug().Str("source", src.ID).Msg("Scanning news source...")
|
||||||
|
result, err := services.NewsSourceRead(src, data.Eager)
|
||||||
|
if err != nil {
|
||||||
|
log.Warn().Err(err).Str("source", src.ID).Msg("Failed to scan a news source.")
|
||||||
|
}
|
||||||
|
|
||||||
|
result = lo.UniqBy(result, func(item models.NewsArticle) string {
|
||||||
|
return item.Hash
|
||||||
|
})
|
||||||
|
database.C.Clauses(clause.OnConflict{
|
||||||
|
UpdateAll: true,
|
||||||
|
}).Create(&result)
|
||||||
|
|
||||||
|
log.Info().Str("source", src.ID).Int("count", len(result)).Msg("Scanned a news sources.")
|
||||||
|
count += len(result)
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Info().Int("count", count).Msg("Scanned all news sources.")
|
||||||
|
}()
|
||||||
|
|
||||||
return c.SendStatus(fiber.StatusOK)
|
return c.SendStatus(fiber.StatusOK)
|
||||||
}
|
}
|
||||||
|
@ -62,9 +62,9 @@ func NewsSourceRead(src models.NewsSource, eager ...bool) ([]models.NewsArticle,
|
|||||||
case "wordpress":
|
case "wordpress":
|
||||||
return newsSourceReadWordpress(src, eager...)
|
return newsSourceReadWordpress(src, eager...)
|
||||||
case "scrap":
|
case "scrap":
|
||||||
return newsSourceReadScrap(src)
|
return newsSourceReadScrap(src, eager...)
|
||||||
case "feed":
|
case "feed":
|
||||||
return newsSourceReadFeed(src)
|
return newsSourceReadFeed(src, eager...)
|
||||||
default:
|
default:
|
||||||
return nil, fmt.Errorf("unsupported news source type: %s", src.Type)
|
return nil, fmt.Errorf("unsupported news source type: %s", src.Type)
|
||||||
}
|
}
|
||||||
@ -119,7 +119,7 @@ func newsSourceReadWordpress(src models.NewsSource, eager ...bool) ([]models.New
|
|||||||
return result, nil
|
return result, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func newsSourceReadFeed(src models.NewsSource) ([]models.NewsArticle, error) {
|
func newsSourceReadFeed(src models.NewsSource, eager ...bool) ([]models.NewsArticle, error) {
|
||||||
pgConvert := func(article models.NewsArticle) models.NewsArticle {
|
pgConvert := func(article models.NewsArticle) models.NewsArticle {
|
||||||
art := &article
|
art := &article
|
||||||
art.GenHash()
|
art.GenHash()
|
||||||
@ -133,8 +133,15 @@ func newsSourceReadFeed(src models.NewsSource) ([]models.NewsArticle, error) {
|
|||||||
fp := gofeed.NewParser()
|
fp := gofeed.NewParser()
|
||||||
feed, _ := fp.ParseURLWithContext(src.Source, ctx)
|
feed, _ := fp.ParseURLWithContext(src.Source, ctx)
|
||||||
|
|
||||||
|
maxPages := lo.Ternary(len(eager) > 0 && eager[0], len(feed.Items), src.Depth)
|
||||||
|
|
||||||
var result []models.NewsArticle
|
var result []models.NewsArticle
|
||||||
for _, item := range feed.Items {
|
for _, item := range feed.Items {
|
||||||
|
if maxPages <= 0 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
maxPages--
|
||||||
parent := models.NewsArticle{
|
parent := models.NewsArticle{
|
||||||
URL: item.Link,
|
URL: item.Link,
|
||||||
Title: item.Title,
|
Title: item.Title,
|
||||||
@ -160,7 +167,7 @@ func newsSourceReadFeed(src models.NewsSource) ([]models.NewsArticle, error) {
|
|||||||
return result, nil
|
return result, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func newsSourceReadScrap(src models.NewsSource) ([]models.NewsArticle, error) {
|
func newsSourceReadScrap(src models.NewsSource, eager ...bool) ([]models.NewsArticle, error) {
|
||||||
pgConvert := func(article models.NewsArticle) models.NewsArticle {
|
pgConvert := func(article models.NewsArticle) models.NewsArticle {
|
||||||
art := &article
|
art := &article
|
||||||
art.GenHash()
|
art.GenHash()
|
||||||
@ -169,7 +176,8 @@ func newsSourceReadScrap(src models.NewsSource) ([]models.NewsArticle, error) {
|
|||||||
return article
|
return article
|
||||||
}
|
}
|
||||||
|
|
||||||
result := ScrapNewsIndex(src.Source)
|
maxPages := lo.Ternary(len(eager) > 0 && eager[0], 0, src.Depth)
|
||||||
|
result := ScrapNewsIndex(src.Source, maxPages)
|
||||||
|
|
||||||
for idx, page := range result {
|
for idx, page := range result {
|
||||||
result[idx] = pgConvert(page)
|
result[idx] = pgConvert(page)
|
||||||
|
@ -128,7 +128,7 @@ func ScrapLink(target string) (*models.LinkMeta, error) {
|
|||||||
|
|
||||||
const ScrapNewsDefaultUA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1.1 Safari/605.1.15"
|
const ScrapNewsDefaultUA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1.1 Safari/605.1.15"
|
||||||
|
|
||||||
func ScrapNewsIndex(target string) []models.NewsArticle {
|
func ScrapNewsIndex(target string, maxPages ...int) []models.NewsArticle {
|
||||||
parsedTarget, err := url.Parse(target)
|
parsedTarget, err := url.Parse(target)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil
|
return nil
|
||||||
@ -140,6 +140,11 @@ func ScrapNewsIndex(target string) []models.NewsArticle {
|
|||||||
ua = ScrapNewsDefaultUA
|
ua = ScrapNewsDefaultUA
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var limit int
|
||||||
|
if len(maxPages) > 0 && maxPages[0] > 0 {
|
||||||
|
limit = maxPages[0]
|
||||||
|
}
|
||||||
|
|
||||||
c := colly.NewCollector(
|
c := colly.NewCollector(
|
||||||
colly.UserAgent(ua),
|
colly.UserAgent(ua),
|
||||||
colly.MaxDepth(3),
|
colly.MaxDepth(3),
|
||||||
@ -160,6 +165,10 @@ func ScrapNewsIndex(target string) []models.NewsArticle {
|
|||||||
var result []models.NewsArticle
|
var result []models.NewsArticle
|
||||||
|
|
||||||
c.OnHTML("main a", func(e *colly.HTMLElement) {
|
c.OnHTML("main a", func(e *colly.HTMLElement) {
|
||||||
|
if limit <= 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
url := e.Attr("href")
|
url := e.Attr("href")
|
||||||
if strings.HasPrefix(url, "#") || strings.HasPrefix(url, "javascript:") || strings.HasPrefix(url, "mailto:") {
|
if strings.HasPrefix(url, "#") || strings.HasPrefix(url, "javascript:") || strings.HasPrefix(url, "mailto:") {
|
||||||
return
|
return
|
||||||
@ -168,6 +177,7 @@ func ScrapNewsIndex(target string) []models.NewsArticle {
|
|||||||
url = fmt.Sprintf("%s%s", baseUrl, url)
|
url = fmt.Sprintf("%s%s", baseUrl, url)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
limit--
|
||||||
article, err := ScrapNews(url)
|
article, err := ScrapNews(url)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn().Err(err).Str("url", url).Msg("Failed to scrap a news article...")
|
log.Warn().Err(err).Str("url", url).Msg("Failed to scrap a news article...")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user