✨ Scrap news site
@@ -1,9 +1,10 @@
 package models
 
 type NewsSource struct {
-	ID     string `json:"id"`
-	Label  string `json:"label"`
-	Type   string `json:"type"`
-	Source string `json:"source"`
-	Depth  int    `json:"depth"`
+	ID      string `json:"id"`
+	Label   string `json:"label"`
+	Type    string `json:"type"`
+	Source  string `json:"source"`
+	Depth   int    `json:"depth"`
+	Enabled bool   `json:"enabled"`
 }
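Review note: `Enabled` decodes from the config's `enabled` key. Since Go leaves missing fields at their zero value, any existing `[[sources]]` entry that omits `enabled` decodes to `false` and is now skipped by the new check in `ScanNewsSources`. A minimal sketch of how the list is presumably loaded; the loader itself is not part of this diff, so `loadNewsSources` is a hypothetical name:

```go
package services

import (
	"github.com/spf13/viper"

	"git.solsynth.dev/hypernet/reader/pkg/internal/models"
)

// loadNewsSources is a hypothetical helper, not code from this commit;
// scan.go declares NewsSources but the diff does not show how it is filled.
// viper matches the TOML key "enabled" to Enabled case-insensitively, and
// entries without it decode to the zero value false.
func loadNewsSources() ([]models.NewsSource, error) {
	var sources []models.NewsSource
	err := viper.UnmarshalKey("sources", &sources)
	return sources, err
}
```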
@@ -2,6 +2,7 @@ package api
 
 import (
 	"git.solsynth.dev/hypernet/nexus/pkg/nex/sec"
+	"git.solsynth.dev/hypernet/reader/pkg/internal/services"
 	"github.com/gofiber/fiber/v2"
 )
 
@@ -12,6 +13,11 @@ func MapAPIs(app *fiber.App, baseURL string) {
 
 		admin := api.Group("/admin").Name("Admin")
 		{
+			admin.Get("/scan", func(c *fiber.Ctx) error {
+				services.ScanNewsSources()
+				return c.SendStatus(fiber.StatusOK)
+			})
+
 			admin.Post("/scan", sec.ValidatorMiddleware, adminTriggerScanTask)
 		}
 
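Review note: unlike the existing `POST /scan`, the new `GET /scan` route is registered without `sec.ValidatorMiddleware`, so it is callable unauthenticated, and it runs `ScanNewsSources()` synchronously, blocking the request until every source has been scanned. A hedged usage sketch; the host, port, and `/api` prefix are assumptions for illustration, since none of them appear in this hunk:

```go
package main

import (
	"fmt"
	"net/http"
)

func main() {
	// Assumed address and route prefix, for illustration only.
	resp, err := http.Get("http://localhost:8000/api/admin/scan")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	// The handler only returns once ScanNewsSources() has finished,
	// so a slow source makes this request hang.
	fmt.Println(resp.Status)
}
```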
@@ -16,7 +16,9 @@ func listNewsArticles(c *fiber.Ctx) error {
 	}
 
 	var articles []models.NewsArticle
-	if err := database.C.Limit(take).Offset(offset).Omit("Content").Find(&articles).Error; err != nil {
+	if err := database.C.Limit(take).Offset(offset).
+		Omit("Content").Order("created_at DESC").
+		Find(&articles).Error; err != nil {
 		return fiber.NewError(fiber.StatusInternalServerError, err.Error())
 	}
 
@@ -7,8 +7,10 @@ import (
 	"git.solsynth.dev/hypernet/reader/pkg/internal/database"
 	"git.solsynth.dev/hypernet/reader/pkg/internal/models"
 	"github.com/rs/zerolog/log"
+	"github.com/samber/lo"
 	"github.com/sogko/go-wordpress"
 	"github.com/spf13/viper"
+	"gorm.io/gorm/clause"
 )
 
 var NewsSources []models.NewsSource
@@ -28,6 +30,10 @@ func ScanNewsSourcesNoEager() {
 func ScanNewsSources(eager ...bool) {
 	var results []models.NewsArticle
 	for _, src := range NewsSources {
+		if !src.Enabled {
+			continue
+		}
+
 		log.Debug().Str("source", src.ID).Msg("Scanning news source...")
 		result, err := NewsSourceRead(src)
 		if err != nil {
@@ -37,13 +43,22 @@ func ScanNewsSources(eager ...bool) {
 		log.Info().Str("source", src.ID).Int("count", len(result)).Msg("Scanned a news source.")
 	}
 	log.Info().Int("count", len(results)).Msg("Scanned all news sources.")
-	database.C.Save(&results)
+
+	results = lo.UniqBy(results, func(item models.NewsArticle) string {
+		return item.Hash
+	})
+
+	database.C.Clauses(clause.OnConflict{
+		UpdateAll: true,
+	}).Create(&results)
 }
 
 func NewsSourceRead(src models.NewsSource, eager ...bool) ([]models.NewsArticle, error) {
 	switch src.Type {
 	case "wordpress":
-		return newsSourceReadWordpress(src)
+		return newsSourceReadWordpress(src, eager...)
+	case "scrap":
+		return newsSourceReadScrap(src)
 	default:
 		return nil, fmt.Errorf("unsupported news source type: %s", src.Type)
 	}
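Review note: `lo.UniqBy` removes in-batch duplicates by `Hash`, which also avoids Postgres's "ON CONFLICT DO UPDATE command cannot affect row a second time" error when one batch carries the same article twice. With an empty `Columns` list, GORM's `clause.OnConflict{UpdateAll: true}` targets the primary key; if deduplication is meant to key on `Hash` instead, the conflict target has to be named explicitly. A hedged sketch, assuming a unique index on `hash`, which this diff does not show:

```go
// Sketch only: upsert keyed on the hash column rather than the primary
// key. Requires a unique index on "hash"; whether the NewsArticle model
// declares one is an assumption, not something this commit shows.
database.C.Clauses(clause.OnConflict{
	Columns:   []clause.Column{{Name: "hash"}},
	UpdateAll: true,
}).Create(&results)
```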
@@ -93,3 +108,21 @@ func newsSourceReadWordpress(src models.NewsSource, eager ...bool) ([]models.NewsArticle, error) {
 
 	return result, nil
 }
+
+func newsSourceReadScrap(src models.NewsSource) ([]models.NewsArticle, error) {
+	pgConvert := func(article models.NewsArticle) models.NewsArticle {
+		art := &article
+		art.GenHash()
+		art.Source = src.ID
+		article = *art
+		return article
+	}
+
+	result := ScrapNewsIndex(src.Source)
+
+	for idx, page := range result {
+		result[idx] = pgConvert(page)
+	}
+
+	return result, nil
+}
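Review note: the pointer round-trip inside `pgConvert` (`art := &article; ...; article = *art`) is a no-op, since `article` is already a local copy. Because slice elements are addressable, the same effect can be had by mutating in place; `GenHash` must already use a pointer receiver for the committed version to take effect. A behaviorally identical sketch, where `newsSourceReadScrapSimpler` is a hypothetical name:

```go
// Equivalent to the committed newsSourceReadScrap, without the copy
// through pgConvert: each article is mutated in place.
func newsSourceReadScrapSimpler(src models.NewsSource) ([]models.NewsArticle, error) {
	result := ScrapNewsIndex(src.Source)
	for idx := range result {
		result[idx].GenHash()       // hash first, as in the original
		result[idx].Source = src.ID // then stamp the source ID
	}
	return result, nil
}
```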
@@ -3,8 +3,11 @@ package services
 import (
 	"crypto/md5"
 	"encoding/hex"
+	"fmt"
 	"net"
 	"net/http"
+	"net/url"
+	"strings"
 	"time"
 
 	"git.solsynth.dev/hypernet/reader/pkg/internal/database"
@@ -122,3 +125,109 @@ func ScrapLink(target string) (*models.LinkMeta, error) {
 
 	return meta, c.Visit(target)
 }
+
+const ScrapNewsDefaultUA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1.1 Safari/605.1.15"
+
+func ScrapNewsIndex(target string) []models.NewsArticle {
+	parsedTarget, err := url.Parse(target)
+	if err != nil {
+		return nil
+	}
+	baseUrl := fmt.Sprintf("%s://%s", parsedTarget.Scheme, parsedTarget.Host)
+
+	ua := viper.GetString("scraper.news_ua")
+	if len(ua) == 0 {
+		ua = ScrapNewsDefaultUA
+	}
+
+	c := colly.NewCollector(
+		colly.UserAgent(ua),
+		colly.MaxDepth(3),
+	)
+
+	c.WithTransport(&http.Transport{
+		Proxy: http.ProxyFromEnvironment,
+		DialContext: (&net.Dialer{
+			Timeout:   60 * time.Second,
+			KeepAlive: 360 * time.Second,
+		}).DialContext,
+		MaxIdleConns:          100,
+		IdleConnTimeout:       90 * time.Second,
+		TLSHandshakeTimeout:   10 * time.Second,
+		ExpectContinueTimeout: 1 * time.Second,
+	})
+
+	var result []models.NewsArticle
+
+	c.OnHTML("main a", func(e *colly.HTMLElement) {
+		url := e.Attr("href")
+		if strings.HasPrefix(url, "#") || strings.HasPrefix(url, "javascript:") || strings.HasPrefix(url, "mailto:") {
+			return
+		}
+		if !strings.HasPrefix(url, "http") {
+			url = fmt.Sprintf("%s%s", baseUrl, url)
+		}
+
+		article, err := ScrapNews(url)
+		if err != nil {
+			log.Warn().Err(err).Str("url", url).Msg("Failed to scrap a news article...")
+			return
+		}
+
+		log.Debug().Str("url", url).Msg("Scraped a news article...")
+		if article != nil {
+			result = append(result, *article)
+		}
+	})
+
+	c.Visit(target)
+
+	return result
+}
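Review note: the prefix check handles root-relative hrefs like `/news/article`, but not protocol-relative ones (`//cdn.example.com/...`, which would become `https://www.bbc.com//cdn.example.com/...`) or document-relative ones (`story/123`); the local `url` variable also shadows the `net/url` package, which works but is easy to trip over, and the error from `c.Visit(target)` is discarded, so a dead index page silently yields an empty slice. `net/url` already resolves every href form. A hedged sketch that could replace the string handling, reusing the `net/url` and `strings` imports this file already has; `resolveHref` is a hypothetical name, not in the commit:

```go
// resolveHref resolves any href form (absolute, root-relative,
// document-relative, protocol-relative) against the page URL, and
// filters out fragment/javascript/mailto links like the committed code.
func resolveHref(pageURL, href string) (string, bool) {
	if strings.HasPrefix(href, "#") || strings.HasPrefix(href, "javascript:") || strings.HasPrefix(href, "mailto:") {
		return "", false
	}
	base, err := url.Parse(pageURL)
	if err != nil {
		return "", false
	}
	ref, err := url.Parse(href)
	if err != nil {
		return "", false
	}
	return base.ResolveReference(ref).String(), true
}
```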
+
+func ScrapNews(target string) (*models.NewsArticle, error) {
+	ua := viper.GetString("scraper.news_ua")
+	if len(ua) == 0 {
+		ua = ScrapNewsDefaultUA
+	}
+
+	c := colly.NewCollector(
+		colly.UserAgent(ua),
+		colly.MaxDepth(3),
+	)
+
+	c.WithTransport(&http.Transport{
+		Proxy: http.ProxyFromEnvironment,
+		DialContext: (&net.Dialer{
+			Timeout:   60 * time.Second,
+			KeepAlive: 360 * time.Second,
+		}).DialContext,
+		MaxIdleConns:          100,
+		IdleConnTimeout:       90 * time.Second,
+		TLSHandshakeTimeout:   10 * time.Second,
+		ExpectContinueTimeout: 1 * time.Second,
+	})
+
+	article := &models.NewsArticle{
+		URL: target,
+	}
+
+	c.OnHTML("title", func(e *colly.HTMLElement) {
+		article.Title = e.Text
+	})
+	c.OnHTML("meta[name]", func(e *colly.HTMLElement) {
+		switch e.Attr("name") {
+		case "description":
+			article.Description = e.Attr("content")
+		}
+	})
+
+	c.OnHTML("article", func(e *colly.HTMLElement) {
+		article.Content, _ = e.DOM.Html()
+	})
+	c.OnHTML("article img", func(e *colly.HTMLElement) {
+		article.Thumbnail = e.Attr("src")
+	})
+
+	return article, c.Visit(target)
+}
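Review note: `ScrapNewsIndex` and `ScrapNews` build identically configured collectors, which could share a constructor. More importantly, the `article img` handler fires once per image inside `<article>`, so `Thumbnail` ends up as the last image's `src`. A hedged alternative that prefers the page's `og:image` meta tag and otherwise keeps the first article image; this is an assumption about intent, not committed behavior:

```go
// Prefer og:image when present; otherwise keep the first <article>
// image instead of letting later images overwrite the thumbnail.
c.OnHTML(`meta[property="og:image"]`, func(e *colly.HTMLElement) {
	article.Thumbnail = e.Attr("content")
})
c.OnHTML("article img", func(e *colly.HTMLElement) {
	if article.Thumbnail == "" {
		article.Thumbnail = e.Attr("src")
	}
})
```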
@@ -10,6 +10,19 @@ id = "shadiao"
 label = "Shadiao Plus"
 type = "wordpress"
 source = "https://shadiao.plus/wp-json/wp/v2"
+enabled = false
+[[sources]]
+id = "cnn-politics"
+label = "CNN Politics"
+type = "scrap"
+source = "https://www.cnn.com/politics"
+enabled = false
+[[sources]]
+id = "bbc"
+label = "BBC News"
+type = "scrap"
+source = "https://www.bbc.com/news"
+enabled = true
 
 [debug]
 database = true