Basic news reading

Able to read WordPress sites

btw, the package from 10 years ago still works properly, amazing...
it means the WordPress API did not change much, and Go's backward
compatibility is amazing!
2025-01-25 22:05:38 +08:00
parent e055ef27e5
commit ab0a87106b
11 changed files with 185 additions and 19 deletions

View File

@@ -7,6 +7,7 @@ import (
 var AutoMaintainRange = []any{
 	&models.LinkMeta{},
+	&models.NewsArticle{},
 }
 
 func RunMigration(source *gorm.DB) error {
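
The body of RunMigration is cut off by the hunk; presumably it feeds AutoMaintainRange into GORM's auto-migration. A minimal sketch of that assumption (not the actual code from the repository):

// Hypothetical: AutoMaintainRange is []any, so it can be expanded
// straight into GORM's variadic AutoMigrate.
func RunMigration(source *gorm.DB) error {
	return source.AutoMigrate(AutoMaintainRange...)
}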

View File

@@ -0,0 +1,32 @@
+package models
+
+import (
+	"crypto/md5"
+	"encoding/hex"
+
+	"git.solsynth.dev/hypernet/nexus/pkg/nex/cruda"
+	"github.com/google/uuid"
+)
+
+type NewsArticle struct {
+	cruda.BaseModel
+
+	Thumbnail   string `json:"thumbnail"`
+	Title       string `json:"title"`
+	Description string `json:"description"`
+	Content     string `json:"content"`
+	URL         string `json:"url"`
+	Hash        string `json:"hash" gorm:"uniqueIndex"`
+	Source      string `json:"source"`
+}
+
+func (v *NewsArticle) GenHash() *NewsArticle {
+	if len(v.URL) == 0 {
+		v.Hash = uuid.NewString()
+		return v
+	}
+	hash := md5.Sum([]byte(v.URL))
+	v.Hash = hex.EncodeToString(hash[:])
+	return v
+}
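
Since Hash is backed by a unique index, GenHash gives every article a stable deduplication key. A tiny sketch of the behaviour (example URL made up):

// The same URL always produces the same 32-character hex digest, so a
// re-scanned article maps onto the same key; articles without a URL
// get a random UUID instead and are never deduplicated.
a := (&NewsArticle{URL: "https://example.com/post/1"}).GenHash()
fmt.Println(len(a.Hash)) // 32, deterministic across scans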

View File

@@ -0,0 +1,8 @@
+package models
+
+type NewsSource struct {
+	ID     string `json:"id"`
+	Label  string `json:"label"`
+	Type   string `json:"type"`
+	Source string `json:"source"`
+}
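
For reference, here is what one configured source might look like once LoadNewsSources (further down in this commit) has unmarshalled it; every value is a made-up placeholder:

// Hypothetical values: "wordpress" is the only type the scanner
// handles in this commit, and Source is expected to be the site's
// REST API base URL.
src := NewsSource{
	ID:     "example-blog",
	Label:  "Example Blog",
	Type:   "wordpress",
	Source: "https://example.com/wp-json/wp/v2",
}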

View File

@@ -7,6 +7,7 @@ import (
 func MapAPIs(app *fiber.App, baseURL string) {
 	api := app.Group(baseURL).Name("API")
 	{
+		api.Get("/well-known/sources", getNewsSources)
 		api.Get("/link/*", getLinkMeta)
 	}
 }

View File

@@ -0,0 +1,10 @@
+package api
+
+import (
+	"git.solsynth.dev/hypernet/reader/pkg/internal/services"
+	"github.com/gofiber/fiber/v2"
+)
+
+func getNewsSources(c *fiber.Ctx) error {
+	return c.JSON(services.NewsSources)
+}
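
Assuming baseURL is something like /api, this endpoint answers GET /api/well-known/sources with the configured sources serialized as a JSON array, e.g. [{"id":"example-blog","label":"Example Blog","type":"wordpress","source":"https://example.com/wp-json/wp/v2"}] (placeholder values). Note the handler returns services.NewsSources verbatim, so anything placed in that struct becomes public.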

View File

@@ -0,0 +1,71 @@
+package services
+
+import (
+	"fmt"
+
+	"git.solsynth.dev/hypernet/reader/pkg/internal/database"
+	"git.solsynth.dev/hypernet/reader/pkg/internal/models"
+	"github.com/rs/zerolog/log"
+	"github.com/sogko/go-wordpress"
+	"github.com/spf13/viper"
+)
+
+var NewsSources []models.NewsSource
+
+func LoadNewsSources() error {
+	if err := viper.UnmarshalKey("sources", &NewsSources); err != nil {
+		return err
+	}
+	log.Info().Int("count", len(NewsSources)).Msg("Loaded news sources configuration.")
+	return nil
+}
+
+func ScanNewsSources() {
+	var results []models.NewsArticle
+	for _, src := range NewsSources {
+		log.Debug().Str("source", src.ID).Msg("Scanning news source...")
+		result, err := NewsSourceRead(src)
+		if err != nil {
+			log.Warn().Err(err).Str("source", src.ID).Msg("Failed to scan a news source.")
+			continue
+		}
+		results = append(results, result...)
+		log.Info().Str("source", src.ID).Int("count", len(result)).Msg("Scanned a news source.")
+	}
+	log.Info().Int("count", len(results)).Msg("Scanned all news sources.")
+	database.C.Save(&results)
+}
+
+func NewsSourceRead(src models.NewsSource) ([]models.NewsArticle, error) {
+	switch src.Type {
+	case "wordpress":
+		return newsSourceReadWordpress(src)
+	default:
+		return nil, fmt.Errorf("unsupported news source type: %s", src.Type)
+	}
+}
+
+func newsSourceReadWordpress(src models.NewsSource) ([]models.NewsArticle, error) {
+	client := wordpress.NewClient(&wordpress.Options{
+		BaseAPIURL: src.Source,
+	})
+
+	posts, _, _, err := client.Posts().List(nil)
+	if err != nil {
+		return nil, err
+	}
+
+	var result []models.NewsArticle
+	for _, post := range posts {
+		article := &models.NewsArticle{
+			Title:       post.Title.Rendered,
+			Description: post.Excerpt.Rendered,
+			Content:     post.Content.Rendered,
+			URL:         post.Link,
+			Source:      src.ID,
+		}
+		article.GenHash()
+		result = append(result, *article)
+	}
+
+	return result, nil
+}
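
Two caveats worth flagging. First, client.Posts().List(nil) presumably returns only the first page of posts (the WordPress REST API defaults to 10 per page), so full coverage would need pagination. Second, Hash carries a unique index, yet ScanNewsSources persists with a plain Save, which will collide once the same articles are scanned again; a possible hardening, sketched under that assumption, is GORM's on-conflict upsert:

import "gorm.io/gorm/clause"

// Upsert keyed on the unique hash column: rows that already exist are
// updated in place instead of tripping the unique index.
database.C.Clauses(clause.OnConflict{
	Columns:   []clause.Column{{Name: "hash"}},
	UpdateAll: true,
}).Create(&results)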

View File

@@ -12,10 +12,11 @@ import (
 	"github.com/gocolly/colly"
 	"github.com/rs/zerolog/log"
 	"github.com/samber/lo"
+	"github.com/spf13/viper"
 )
 
 // We have to set the User-Agent to this so sites will respond with OpenGraph data
-const ScrapLinkUserAgent = "facebookexternalhit/1.1"
+const ScrapLinkDefaultUA = "facebookexternalhit/1.1"
 
 func GetLinkMetaFromCache(target string) (models.LinkMeta, error) {
 	hash := md5.Sum([]byte(target))

@@ -40,8 +41,13 @@ func ScrapLink(target string) (*models.LinkMeta, error) {
 		return &cache, nil
 	}
 
+	ua := viper.GetString("scraper.expand_ua")
+	if len(ua) == 0 {
+		ua = ScrapLinkDefaultUA
+	}
+
 	c := colly.NewCollector(
-		colly.UserAgent(ScrapLinkUserAgent),
+		colly.UserAgent(ua),
 		colly.MaxDepth(3),
 	)
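
The User-Agent is now configurable: scraper.expand_ua from the configuration wins when set, and the Facebook crawler UA stays as the fallback. A quick illustration (the UA string here is invented):

// Equivalent to setting scraper.expand_ua in the config file; the
// viper.GetString call in ScrapLink picks this up on the next scrape.
viper.Set("scraper.expand_ua", "Mozilla/5.0 (compatible; ReaderBot/1.0)")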

View File

@@ -72,9 +72,15 @@ func main() {
 		log.Fatal().Err(err).Msg("An error occurred when initializing cache.")
 	}
 
+	// Load news sources
+	if err := services.LoadNewsSources(); err != nil {
+		log.Fatal().Err(err).Msg("An error occurred when loading news sources.")
+	}
+
 	// Configure timed tasks
 	quartz := cron.New(cron.WithLogger(cron.VerbosePrintfLogger(&log.Logger)))
 	quartz.AddFunc("@every 60m", services.DoAutoDatabaseCleanup)
+	quartz.AddFunc("@midnight", services.ScanNewsSources)
 	quartz.Start()
 
 	// Server
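
With robfig/cron's descriptors, cleanup runs every hour and the news scan once a day at midnight, so a fresh deployment stays empty until the first tick. A small follow-up worth considering (not part of this commit) would be an immediate scan at startup:

// Populate the database right away instead of waiting for the first
// @midnight run.
go services.ScanNewsSources()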