diff --git a/pkg/internal/models/source.go b/pkg/internal/models/source.go
index 79cd471..aa8af64 100644
--- a/pkg/internal/models/source.go
+++ b/pkg/internal/models/source.go
@@ -1,9 +1,10 @@
 package models
 
 type NewsSource struct {
-	ID     string `json:"id"`
-	Label  string `json:"label"`
-	Type   string `json:"type"`
-	Source string `json:"source"`
-	Depth  int    `json:"depth"`
+	ID      string `json:"id"`
+	Label   string `json:"label"`
+	Type    string `json:"type"`
+	Source  string `json:"source"`
+	Depth   int    `json:"depth"`
+	Enabled bool   `json:"enabled"`
 }
diff --git a/pkg/internal/server/api/index.go b/pkg/internal/server/api/index.go
index 76abad1..24803eb 100644
--- a/pkg/internal/server/api/index.go
+++ b/pkg/internal/server/api/index.go
@@ -2,6 +2,7 @@ package api
 
 import (
 	"git.solsynth.dev/hypernet/nexus/pkg/nex/sec"
+	"git.solsynth.dev/hypernet/reader/pkg/internal/services"
 	"github.com/gofiber/fiber/v2"
 )
 
@@ -12,6 +13,11 @@ func MapAPIs(app *fiber.App, baseURL string) {
 
 	admin := api.Group("/admin").Name("Admin")
 	{
+		admin.Get("/scan", sec.ValidatorMiddleware, func(c *fiber.Ctx) error {
+			services.ScanNewsSources()
+			return c.SendStatus(fiber.StatusOK)
+		})
+
 		admin.Post("/scan", sec.ValidatorMiddleware, adminTriggerScanTask)
 	}
 
diff --git a/pkg/internal/server/api/news_api.go b/pkg/internal/server/api/news_api.go
index 854162e..cca6118 100644
--- a/pkg/internal/server/api/news_api.go
+++ b/pkg/internal/server/api/news_api.go
@@ -16,7 +16,9 @@ func listNewsArticles(c *fiber.Ctx) error {
 	}
 
 	var articles []models.NewsArticle
-	if err := database.C.Limit(take).Offset(offset).Omit("Content").Find(&articles).Error; err != nil {
+	if err := database.C.Limit(take).Offset(offset).
+		Omit("Content").Order("created_at DESC").
+		Find(&articles).Error; err != nil {
 		return fiber.NewError(fiber.StatusInternalServerError, err.Error())
 	}
 
diff --git a/pkg/internal/services/reader.go b/pkg/internal/services/reader.go
index 7a62b03..4751005 100644
--- a/pkg/internal/services/reader.go
+++ b/pkg/internal/services/reader.go
@@ -7,8 +7,10 @@ import (
 	"git.solsynth.dev/hypernet/reader/pkg/internal/database"
 	"git.solsynth.dev/hypernet/reader/pkg/internal/models"
 	"github.com/rs/zerolog/log"
+	"github.com/samber/lo"
 	"github.com/sogko/go-wordpress"
 	"github.com/spf13/viper"
+	"gorm.io/gorm/clause"
 )
 
 var NewsSources []models.NewsSource
@@ -28,6 +30,10 @@ func ScanNewsSourcesNoEager() {
 func ScanNewsSources(eager ...bool) {
 	var results []models.NewsArticle
 	for _, src := range NewsSources {
+		if !src.Enabled {
+			continue
+		}
+
 		log.Debug().Str("source", src.ID).Msg("Scanning news source...")
 		result, err := NewsSourceRead(src)
 		if err != nil {
@@ -37,13 +43,24 @@
 		log.Info().Str("source", src.ID).Int("count", len(result)).Msg("Scanned a news sources.")
 	}
 	log.Info().Int("count", len(results)).Msg("Scanned all news sources.")
-	database.C.Save(&results)
+
+	results = lo.UniqBy(results, func(item models.NewsArticle) string {
+		return item.Hash
+	})
+
+	if len(results) > 0 {
+		database.C.Clauses(clause.OnConflict{
+			UpdateAll: true,
+		}).Create(&results)
+	}
 }
 
 func NewsSourceRead(src models.NewsSource, eager ...bool) ([]models.NewsArticle, error) {
 	switch src.Type {
 	case "wordpress":
-		return newsSourceReadWordpress(src)
+		return newsSourceReadWordpress(src, eager...)
+	case "scrap":
+		return newsSourceReadScrap(src)
 	default:
 		return nil, fmt.Errorf("unsupported news source type: %s", src.Type)
 	}
@@ -93,3 +108,21 @@ func newsSourceReadWordpress(src models.NewsSource, eager ...bool) ([]models.New
 
	return result, nil
 }
+
+func newsSourceReadScrap(src models.NewsSource) ([]models.NewsArticle, error) {
+	pgConvert := func(article models.NewsArticle) models.NewsArticle {
+		// article is an addressable local copy, so pointer-receiver
+		// methods such as GenHash mutate it in place before returning.
+		article.GenHash()
+		article.Source = src.ID
+		return article
+	}
+
+	result := ScrapNewsIndex(src.Source)
+
+	for idx, page := range result {
+		result[idx] = pgConvert(page)
+	}
+
+	return result, nil
+}
diff --git a/pkg/internal/services/scraper.go b/pkg/internal/services/scraper.go
index e772e06..89ccc6e 100644
--- a/pkg/internal/services/scraper.go
+++ b/pkg/internal/services/scraper.go
@@ -3,8 +3,11 @@ package services
 import (
 	"crypto/md5"
 	"encoding/hex"
+	"fmt"
 	"net"
 	"net/http"
+	"net/url"
+	"strings"
 	"time"
 
 	"git.solsynth.dev/hypernet/reader/pkg/internal/database"
@@ -122,3 +125,109 @@ func ScrapLink(target string) (*models.LinkMeta, error) {
 
 	return meta, c.Visit(target)
 }
+
+const ScrapNewsDefaultUA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1.1 Safari/605.1.15"
+
+func ScrapNewsIndex(target string) []models.NewsArticle {
+	parsedTarget, err := url.Parse(target)
+	if err != nil {
+		return nil
+	}
+	baseUrl := fmt.Sprintf("%s://%s", parsedTarget.Scheme, parsedTarget.Host)
+
+	ua := viper.GetString("scraper.news_ua")
+	if len(ua) == 0 {
+		ua = ScrapNewsDefaultUA
+	}
+
+	c := colly.NewCollector(
+		colly.UserAgent(ua),
+		colly.MaxDepth(3),
+	)
+
+	c.WithTransport(&http.Transport{
+		Proxy: http.ProxyFromEnvironment,
+		DialContext: (&net.Dialer{
+			Timeout:   60 * time.Second,
+			KeepAlive: 360 * time.Second,
+		}).DialContext,
+		MaxIdleConns:          100,
+		IdleConnTimeout:       90 * time.Second,
+		TLSHandshakeTimeout:   10 * time.Second,
+		ExpectContinueTimeout: 1 * time.Second,
+	})
+
+	var result 
[]models.NewsArticle
+
+	c.OnHTML("main a", func(e *colly.HTMLElement) {
+		href := e.Attr("href")
+		if strings.HasPrefix(href, "#") || strings.HasPrefix(href, "javascript:") || strings.HasPrefix(href, "mailto:") {
+			return
+		}
+		if !strings.HasPrefix(href, "http") {
+			href = fmt.Sprintf("%s%s", baseUrl, href)
+		}
+
+		article, err := ScrapNews(href)
+		if err != nil {
+			log.Warn().Err(err).Str("url", href).Msg("Failed to scrap a news article...")
+			return
+		}
+
+		log.Debug().Str("url", href).Msg("Scraped a news article...")
+		if article != nil {
+			result = append(result, *article)
+		}
+	})
+
+	c.Visit(target)
+
+	return result
+}
+
+func ScrapNews(target string) (*models.NewsArticle, error) {
+	ua := viper.GetString("scraper.news_ua")
+	if len(ua) == 0 {
+		ua = ScrapNewsDefaultUA
+	}
+
+	c := colly.NewCollector(
+		colly.UserAgent(ua),
+		colly.MaxDepth(3),
+	)
+
+	c.WithTransport(&http.Transport{
+		Proxy: http.ProxyFromEnvironment,
+		DialContext: (&net.Dialer{
+			Timeout:   60 * time.Second,
+			KeepAlive: 360 * time.Second,
+		}).DialContext,
+		MaxIdleConns:          100,
+		IdleConnTimeout:       90 * time.Second,
+		TLSHandshakeTimeout:   10 * time.Second,
+		ExpectContinueTimeout: 1 * time.Second,
+	})
+
+	article := &models.NewsArticle{
+		URL: target,
+	}
+
+	c.OnHTML("title", func(e *colly.HTMLElement) {
+		article.Title = e.Text
+	})
+	c.OnHTML("meta[name]", func(e *colly.HTMLElement) {
+		switch e.Attr("name") {
+		case "description":
+			article.Description = e.Attr("content")
+		}
+	})
+
+	c.OnHTML("article", func(e *colly.HTMLElement) {
+		article.Content, _ = e.DOM.Html()
+	})
+	c.OnHTML("article img", func(e *colly.HTMLElement) {
+		article.Thumbnail = e.Attr("src")
+	})
+
+	return article, c.Visit(target)
+}
diff --git a/settings.toml b/settings.toml
index 2434443..42bf4e0 100644
--- a/settings.toml
+++ b/settings.toml
@@ -10,6 +10,19 @@ id = "shadiao"
 label = "Shadiao Plus"
 type = "wordpress"
 source = "https://shadiao.plus/wp-json/wp/v2"
+enabled = false
+[[sources]]
+id = "cnn-politics"
+label = 
"CNN Politics" +type = "scrap" +source = "https://www.cnn.com/politics" +enabled = false +[[sources]] +id = "bbc" +label = "BBC News" +type = "scrap" +source = "https://www.bbc.com/news" +enabled = true [debug] database = true