✨ Scrap news site

2025-01-25 23:54:50 +08:00
parent 48a04b09a4
commit 5aae59a629
6 changed files with 172 additions and 8 deletions
--- a/pkg/internal/models/source.go
+++ b/pkg/internal/models/source.go
@@ -1,9 +1,10 @@
 package models

 type NewsSource struct {
-	ID     string `json:"id"`
-	Label  string `json:"label"`
-	Type   string `json:"type"`
-	Source string `json:"source"`
-	Depth  int    `json:"depth"`
+	ID      string `json:"id"`
+	Label   string `json:"label"`
+	Type    string `json:"type"`
+	Source  string `json:"source"`
+	Depth   int    `json:"depth"`
+	Enabled bool   `json:"enabled"`
 }
--- a/pkg/internal/server/api/index.go
+++ b/pkg/internal/server/api/index.go
@@ -2,6 +2,7 @@ package api

 import (
 	"git.solsynth.dev/hypernet/nexus/pkg/nex/sec"
+	"git.solsynth.dev/hypernet/reader/pkg/internal/services"
 	"github.com/gofiber/fiber/v2"
 )

@@ -12,6 +13,11 @@ func MapAPIs(app *fiber.App, baseURL string) {

 		admin := api.Group("/admin").Name("Admin")
 		{
+			admin.Get("/scan", func(c *fiber.Ctx) error {
+				services.ScanNewsSources()
+				return c.SendStatus(fiber.StatusOK)
+			})
+
 			admin.Post("/scan", sec.ValidatorMiddleware, adminTriggerScanTask)
 		}

--- a/pkg/internal/server/api/news_api.go
+++ b/pkg/internal/server/api/news_api.go
@@ -16,7 +16,9 @@ func listNewsArticles(c *fiber.Ctx) error {
 	}

 	var articles []models.NewsArticle
-	if err := database.C.Limit(take).Offset(offset).Omit("Content").Find(&articles).Error; err != nil {
+	if err := database.C.Limit(take).Offset(offset).
+		Omit("Content").Order("created_at DESC").
+		Find(&articles).Error; err != nil {
 		return fiber.NewError(fiber.StatusInternalServerError, err.Error())
 	}

--- a/pkg/internal/services/reader.go
+++ b/pkg/internal/services/reader.go
@@ -7,8 +7,10 @@ import (
 	"git.solsynth.dev/hypernet/reader/pkg/internal/database"
 	"git.solsynth.dev/hypernet/reader/pkg/internal/models"
 	"github.com/rs/zerolog/log"
+	"github.com/samber/lo"
 	"github.com/sogko/go-wordpress"
 	"github.com/spf13/viper"
+	"gorm.io/gorm/clause"
 )

 var NewsSources []models.NewsSource
@@ -28,6 +30,10 @@ func ScanNewsSourcesNoEager() {
 func ScanNewsSources(eager ...bool) {
 	var results []models.NewsArticle
 	for _, src := range NewsSources {
+		if !src.Enabled {
+			continue
+		}
+
 		log.Debug().Str("source", src.ID).Msg("Scanning news source...")
 		result, err := NewsSourceRead(src)
 		if err != nil {
@@ -37,13 +43,22 @@ func ScanNewsSources(eager ...bool) {
 		log.Info().Str("source", src.ID).Int("count", len(result)).Msg("Scanned a news sources.")
 	}
 	log.Info().Int("count", len(results)).Msg("Scanned all news sources.")
-	database.C.Save(&results)
+
+	results = lo.UniqBy(results, func(item models.NewsArticle) string {
+		return item.Hash
+	})
+
+	database.C.Clauses(clause.OnConflict{
+		UpdateAll: true,
+	}).Create(&results)
 }

 func NewsSourceRead(src models.NewsSource, eager ...bool) ([]models.NewsArticle, error) {
 	switch src.Type {
 	case "wordpress":
-		return newsSourceReadWordpress(src)
+		return newsSourceReadWordpress(src, eager...)
+	case "scrap":
+		return newsSourceReadScrap(src)
 	default:
 		return nil, fmt.Errorf("unsupported news source type: %s", src.Type)
 	}
@@ -93,3 +108,21 @@ func newsSourceReadWordpress(src models.NewsSource, eager ...bool) ([]models.New

 	return result, nil
 }
+
+// newsSourceReadScrap reads a "scrap" type news source.
+//
+// It hands src.Source to ScrapNewsIndex and post-processes every article that
+// comes back: GenHash is called on the article first, then its Source field is
+// set to src.ID. The returned error is always nil.
+func newsSourceReadScrap(src models.NewsSource) ([]models.NewsArticle, error) {
+	articles := ScrapNewsIndex(src.Source)
+
+	// Work on the slice elements directly instead of copying each one out
+	// and writing it back.
+	for i := range articles {
+		entry := &articles[i]
+		entry.GenHash()
+		entry.Source = src.ID
+	}
+
+	return articles, nil
+}
--- a/pkg/internal/services/scraper.go
+++ b/pkg/internal/services/scraper.go
@@ -3,8 +3,11 @@ package services
 import (
 	"crypto/md5"
 	"encoding/hex"
+	"fmt"
 	"net"
 	"net/http"
+	"net/url"
+	"strings"
 	"time"

 	"git.solsynth.dev/hypernet/reader/pkg/internal/database"
@@ -122,3 +125,109 @@ func ScrapLink(target string) (*models.LinkMeta, error) {

 	return meta, c.Visit(target)
 }
+
+const ScrapNewsDefaultUA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1.1 Safari/605.1.15"
+
+// ScrapNewsIndex visits the index page at target and scrapes every link found
+// inside its <main> element as a news article via ScrapNews.
+//
+// The User-Agent is read from the "scraper.news_ua" setting and falls back to
+// ScrapNewsDefaultUA when that setting is empty. Links are resolved to absolute
+// URLs, links that are not http(s) are ignored, and each distinct URL is
+// scraped at most once. Articles that fail to scrape are logged and skipped.
+//
+// It returns nil when target cannot be parsed as a URL.
+func ScrapNewsIndex(target string) []models.NewsArticle {
+	if _, err := url.Parse(target); err != nil {
+		log.Warn().Err(err).Str("url", target).Msg("Failed to parse a news index url...")
+		return nil
+	}
+
+	ua := viper.GetString("scraper.news_ua")
+	if len(ua) == 0 {
+		ua = ScrapNewsDefaultUA
+	}
+
+	c := colly.NewCollector(
+		colly.UserAgent(ua),
+		colly.MaxDepth(3),
+	)
+
+	c.WithTransport(&http.Transport{
+		Proxy: http.ProxyFromEnvironment,
+		DialContext: (&net.Dialer{
+			Timeout:   60 * time.Second,
+			KeepAlive: 360 * time.Second,
+		}).DialContext,
+		MaxIdleConns:          100,
+		IdleConnTimeout:       90 * time.Second,
+		TLSHandshakeTimeout:   10 * time.Second,
+		ExpectContinueTimeout: 1 * time.Second,
+	})
+
+	var result []models.NewsArticle
+
+	// An index page often links the same article more than once (headline,
+	// image, teaser). Track resolved URLs so each one is fetched only once.
+	seen := make(map[string]struct{})
+
+	c.OnHTML("main a", func(e *colly.HTMLElement) {
+		href := strings.TrimSpace(e.Attr("href"))
+		if href == "" {
+			// An empty reference would resolve to the index page itself.
+			return
+		}
+
+		// Resolve against the page being visited rather than gluing the href
+		// onto "scheme://host": that only worked for rooted paths and broke
+		// on "story/1", "../x" and protocol-relative "//host/x" links.
+		// NOTE(review): AbsoluteURL is expected to yield "" for fragment-only
+		// or unparsable references; the scheme check below drops those too.
+		link := e.Request.AbsoluteURL(href)
+		if !strings.HasPrefix(link, "http://") && !strings.HasPrefix(link, "https://") {
+			// Covers javascript:, mailto:, tel: and anything else not fetchable.
+			return
+		}
+
+		if _, dup := seen[link]; dup {
+			return
+		}
+		seen[link] = struct{}{}
+
+		article, err := ScrapNews(link)
+		if err != nil {
+			log.Warn().Err(err).Str("url", link).Msg("Failed to scrap a news article...")
+			return
+		}
+
+		log.Debug().Str("url", link).Msg("Scraped a news article...")
+		if article != nil {
+			result = append(result, *article)
+		}
+	})
+
+	// The signature has no error return, so surface a failed index visit in
+	// the log instead of dropping it silently.
+	if err := c.Visit(target); err != nil {
+		log.Warn().Err(fmt.Errorf("visiting news index %q: %w", target, err)).Msg("Failed to scrap a news index...")
+	}
+
+	return result
+}
+
+// ScrapNews fetches the page at target and extracts a news article from it.
+//
+// Field mapping:
+//   - Title:       text of the <title> element inside <head>
+//   - Description: content of <meta name="description">
+//   - Content:     inner HTML of the matched <article> element
+//   - Thumbnail:   src of the first <article> image that has one, resolved to
+//     an absolute URL when possible
+//
+// The User-Agent is read from the "scraper.news_ua" setting and falls back to
+// ScrapNewsDefaultUA when that setting is empty.
+//
+// The returned error is whatever the collector's Visit call reports. The
+// article is returned alongside it and should only be used when it is nil.
+func ScrapNews(target string) (*models.NewsArticle, error) {
+	ua := viper.GetString("scraper.news_ua")
+	if len(ua) == 0 {
+		ua = ScrapNewsDefaultUA
+	}
+
+	c := colly.NewCollector(
+		colly.UserAgent(ua),
+		colly.MaxDepth(3),
+	)
+
+	c.WithTransport(&http.Transport{
+		Proxy: http.ProxyFromEnvironment,
+		DialContext: (&net.Dialer{
+			Timeout:   60 * time.Second,
+			KeepAlive: 360 * time.Second,
+		}).DialContext,
+		MaxIdleConns:          100,
+		IdleConnTimeout:       90 * time.Second,
+		TLSHandshakeTimeout:   10 * time.Second,
+		ExpectContinueTimeout: 1 * time.Second,
+	})
+
+	article := &models.NewsArticle{
+		URL: target,
+	}
+
+	// Restrict to <head>: a bare "title" selector also matches <title>
+	// elements of inline SVG icons, which would overwrite the page title.
+	c.OnHTML("head title", func(e *colly.HTMLElement) {
+		article.Title = e.Text
+	})
+	c.OnHTML("meta[name]", func(e *colly.HTMLElement) {
+		switch e.Attr("name") {
+		case "description":
+			article.Description = e.Attr("content")
+		}
+	})
+
+	c.OnHTML("article", func(e *colly.HTMLElement) {
+		html, err := e.DOM.Html()
+		if err != nil {
+			log.Warn().Err(err).Str("url", target).Msg("Failed to render a news article content...")
+			return
+		}
+		article.Content = html
+	})
+	c.OnHTML("article img", func(e *colly.HTMLElement) {
+		// Keep the first usable image; previously every later image in the
+		// article body replaced the thumbnail.
+		if article.Thumbnail != "" {
+			return
+		}
+		src := strings.TrimSpace(e.Attr("src"))
+		if src == "" {
+			return
+		}
+		// A relative src is useless once detached from the page, so resolve
+		// it; fall back to the raw value if it cannot be resolved.
+		if abs := e.Request.AbsoluteURL(src); abs != "" {
+			src = abs
+		}
+		article.Thumbnail = src
+	})
+
+	return article, c.Visit(target)
+}
--- a/settings.toml
+++ b/settings.toml
@@ -10,6 +10,19 @@ id = "shadiao"
 label = "Shadiao Plus"
 type = "wordpress"
 source = "https://shadiao.plus/wp-json/wp/v2"
+enabled = false
+[[sources]]
+id = "cnn-politics"
+label = "CNN Politics"
+type = "scrap"
+source = "https://www.cnn.com/politics"
+enabled = false
+[[sources]]
+id = "bbc"
+label = "BBC News"
+type = "scrap"
+source = "https://www.bbc.com/news"
+enabled = true

 [debug]
 database = true