2025-01-25 22:05:38 +08:00
|
|
|
package services
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
2025-01-25 22:48:14 +08:00
|
|
|
"strconv"
|
2025-01-25 22:05:38 +08:00
|
|
|
|
|
|
|
"git.solsynth.dev/hypernet/reader/pkg/internal/database"
|
|
|
|
"git.solsynth.dev/hypernet/reader/pkg/internal/models"
|
|
|
|
"github.com/rs/zerolog/log"
|
2025-01-25 23:54:50 +08:00
|
|
|
"github.com/samber/lo"
|
2025-01-25 22:05:38 +08:00
|
|
|
"github.com/sogko/go-wordpress"
|
|
|
|
"github.com/spf13/viper"
|
2025-01-25 23:54:50 +08:00
|
|
|
"gorm.io/gorm/clause"
|
2025-01-25 22:05:38 +08:00
|
|
|
)
|
|
|
|
|
|
|
|
// NewsSources holds the news source definitions. It is filled by
// LoadNewsSources from the "sources" configuration key and iterated by
// ScanNewsSources.
var NewsSources []models.NewsSource
|
|
|
|
|
|
|
|
func LoadNewsSources() error {
|
|
|
|
if err := viper.UnmarshalKey("sources", &NewsSources); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
log.Info().Int("count", len(NewsSources)).Msg("Loaded news sources configuration.")
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2025-01-25 22:48:14 +08:00
|
|
|
func ScanNewsSourcesNoEager() {
|
|
|
|
ScanNewsSources(false)
|
|
|
|
}
|
|
|
|
|
|
|
|
func ScanNewsSources(eager ...bool) {
|
2025-01-25 22:05:38 +08:00
|
|
|
var results []models.NewsArticle
|
|
|
|
for _, src := range NewsSources {
|
2025-01-25 23:54:50 +08:00
|
|
|
if !src.Enabled {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2025-01-25 22:05:38 +08:00
|
|
|
log.Debug().Str("source", src.ID).Msg("Scanning news source...")
|
|
|
|
result, err := NewsSourceRead(src)
|
|
|
|
if err != nil {
|
|
|
|
log.Warn().Err(err).Str("source", src.ID).Msg("Failed to scan a news source.")
|
|
|
|
}
|
|
|
|
results = append(results, result...)
|
|
|
|
log.Info().Str("source", src.ID).Int("count", len(result)).Msg("Scanned a news sources.")
|
|
|
|
}
|
|
|
|
log.Info().Int("count", len(results)).Msg("Scanned all news sources.")
|
2025-01-25 23:54:50 +08:00
|
|
|
|
|
|
|
results = lo.UniqBy(results, func(item models.NewsArticle) string {
|
|
|
|
return item.Hash
|
|
|
|
})
|
|
|
|
|
|
|
|
database.C.Clauses(clause.OnConflict{
|
|
|
|
UpdateAll: true,
|
|
|
|
}).Create(&results)
|
2025-01-25 22:05:38 +08:00
|
|
|
}
|
|
|
|
|
2025-01-25 22:48:14 +08:00
|
|
|
func NewsSourceRead(src models.NewsSource, eager ...bool) ([]models.NewsArticle, error) {
|
2025-01-25 22:05:38 +08:00
|
|
|
switch src.Type {
|
|
|
|
case "wordpress":
|
2025-01-25 23:54:50 +08:00
|
|
|
return newsSourceReadWordpress(src, eager...)
|
|
|
|
case "scrap":
|
|
|
|
return newsSourceReadScrap(src)
|
2025-01-25 22:05:38 +08:00
|
|
|
default:
|
|
|
|
return nil, fmt.Errorf("unsupported news source type: %s", src.Type)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2025-01-25 22:48:14 +08:00
|
|
|
func newsSourceReadWordpress(src models.NewsSource, eager ...bool) ([]models.NewsArticle, error) {
|
|
|
|
wpConvert := func(post wordpress.Post) models.NewsArticle {
|
|
|
|
article := &models.NewsArticle{
|
|
|
|
Title: post.Title.Rendered,
|
|
|
|
Description: post.Excerpt.Rendered,
|
|
|
|
Content: post.Content.Rendered,
|
|
|
|
URL: post.Link,
|
|
|
|
Source: src.ID,
|
|
|
|
}
|
|
|
|
article.GenHash()
|
|
|
|
return *article
|
|
|
|
}
|
|
|
|
|
2025-01-25 22:05:38 +08:00
|
|
|
client := wordpress.NewClient(&wordpress.Options{
|
|
|
|
BaseAPIURL: src.Source,
|
|
|
|
})
|
|
|
|
|
2025-01-25 22:48:14 +08:00
|
|
|
posts, resp, _, err := client.Posts().List(nil)
|
2025-01-25 22:05:38 +08:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
var result []models.NewsArticle
|
|
|
|
for _, post := range posts {
|
2025-01-25 22:48:14 +08:00
|
|
|
result = append(result, wpConvert(post))
|
|
|
|
}
|
|
|
|
|
|
|
|
if len(eager) > 0 && eager[0] {
|
|
|
|
totalPagesRaw := resp.Header.Get("X-WP-TotalPages")
|
|
|
|
totalPages, _ := strconv.Atoi(totalPagesRaw)
|
|
|
|
depth := min(totalPages, src.Depth)
|
|
|
|
for page := 2; page <= depth; page++ {
|
|
|
|
posts, _, _, err := client.Posts().List(nil)
|
|
|
|
if err != nil {
|
|
|
|
return result, nil
|
|
|
|
}
|
|
|
|
for _, post := range posts {
|
|
|
|
result = append(result, wpConvert(post))
|
|
|
|
}
|
2025-01-25 22:05:38 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return result, nil
|
|
|
|
}
|
2025-01-25 23:54:50 +08:00
|
|
|
|
|
|
|
func newsSourceReadScrap(src models.NewsSource) ([]models.NewsArticle, error) {
|
|
|
|
pgConvert := func(article models.NewsArticle) models.NewsArticle {
|
|
|
|
art := &article
|
|
|
|
art.GenHash()
|
|
|
|
art.Source = src.ID
|
|
|
|
article = *art
|
|
|
|
return article
|
|
|
|
}
|
|
|
|
|
|
|
|
result := ScrapNewsIndex(src.Source)
|
|
|
|
|
|
|
|
for idx, page := range result {
|
|
|
|
result[idx] = pgConvert(page)
|
|
|
|
}
|
|
|
|
|
|
|
|
return result, nil
|
|
|
|
}
|