✨ Scrape news site
This commit is contained in:
		| @@ -1,9 +1,10 @@ | ||||
| package models | ||||
|  | ||||
| type NewsSource struct { | ||||
| 	ID     string `json:"id"` | ||||
| 	Label  string `json:"label"` | ||||
| 	Type   string `json:"type"` | ||||
| 	Source string `json:"source"` | ||||
| 	Depth  int    `json:"depth"` | ||||
| 	ID      string `json:"id"` | ||||
| 	Label   string `json:"label"` | ||||
| 	Type    string `json:"type"` | ||||
| 	Source  string `json:"source"` | ||||
| 	Depth   int    `json:"depth"` | ||||
| 	Enabled bool   `json:"enabled"` | ||||
| } | ||||
|   | ||||
| @@ -2,6 +2,7 @@ package api | ||||
|  | ||||
| import ( | ||||
| 	"git.solsynth.dev/hypernet/nexus/pkg/nex/sec" | ||||
| 	"git.solsynth.dev/hypernet/reader/pkg/internal/services" | ||||
| 	"github.com/gofiber/fiber/v2" | ||||
| ) | ||||
|  | ||||
| @@ -12,6 +13,11 @@ func MapAPIs(app *fiber.App, baseURL string) { | ||||
|  | ||||
| 		admin := api.Group("/admin").Name("Admin") | ||||
| 		{ | ||||
| 			admin.Get("/scan", func(c *fiber.Ctx) error { | ||||
| 				services.ScanNewsSources() | ||||
| 				return c.SendStatus(fiber.StatusOK) | ||||
| 			}) | ||||
|  | ||||
| 			admin.Post("/scan", sec.ValidatorMiddleware, adminTriggerScanTask) | ||||
| 		} | ||||
|  | ||||
|   | ||||
| @@ -16,7 +16,9 @@ func listNewsArticles(c *fiber.Ctx) error { | ||||
| 	} | ||||
|  | ||||
| 	var articles []models.NewsArticle | ||||
| 	if err := database.C.Limit(take).Offset(offset).Omit("Content").Find(&articles).Error; err != nil { | ||||
| 	if err := database.C.Limit(take).Offset(offset). | ||||
| 		Omit("Content").Order("created_at DESC"). | ||||
| 		Find(&articles).Error; err != nil { | ||||
| 		return fiber.NewError(fiber.StatusInternalServerError, err.Error()) | ||||
| 	} | ||||
|  | ||||
|   | ||||
| @@ -7,8 +7,10 @@ import ( | ||||
| 	"git.solsynth.dev/hypernet/reader/pkg/internal/database" | ||||
| 	"git.solsynth.dev/hypernet/reader/pkg/internal/models" | ||||
| 	"github.com/rs/zerolog/log" | ||||
| 	"github.com/samber/lo" | ||||
| 	"github.com/sogko/go-wordpress" | ||||
| 	"github.com/spf13/viper" | ||||
| 	"gorm.io/gorm/clause" | ||||
| ) | ||||
|  | ||||
| var NewsSources []models.NewsSource | ||||
| @@ -28,6 +30,10 @@ func ScanNewsSourcesNoEager() { | ||||
| func ScanNewsSources(eager ...bool) { | ||||
| 	var results []models.NewsArticle | ||||
| 	for _, src := range NewsSources { | ||||
| 		if !src.Enabled { | ||||
| 			continue | ||||
| 		} | ||||
|  | ||||
| 		log.Debug().Str("source", src.ID).Msg("Scanning news source...") | ||||
| 		result, err := NewsSourceRead(src) | ||||
| 		if err != nil { | ||||
| @@ -37,13 +43,22 @@ func ScanNewsSources(eager ...bool) { | ||||
| 		log.Info().Str("source", src.ID).Int("count", len(result)).Msg("Scanned a news sources.") | ||||
| 	} | ||||
| 	log.Info().Int("count", len(results)).Msg("Scanned all news sources.") | ||||
| 	database.C.Save(&results) | ||||
|  | ||||
| 	results = lo.UniqBy(results, func(item models.NewsArticle) string { | ||||
| 		return item.Hash | ||||
| 	}) | ||||
|  | ||||
| 	database.C.Clauses(clause.OnConflict{ | ||||
| 		UpdateAll: true, | ||||
| 	}).Create(&results) | ||||
| } | ||||
|  | ||||
| func NewsSourceRead(src models.NewsSource, eager ...bool) ([]models.NewsArticle, error) { | ||||
| 	switch src.Type { | ||||
| 	case "wordpress": | ||||
| 		return newsSourceReadWordpress(src) | ||||
| 		return newsSourceReadWordpress(src, eager...) | ||||
| 	case "scrap": | ||||
| 		return newsSourceReadScrap(src) | ||||
| 	default: | ||||
| 		return nil, fmt.Errorf("unsupported news source type: %s", src.Type) | ||||
| 	} | ||||
| @@ -93,3 +108,21 @@ func newsSourceReadWordpress(src models.NewsSource, eager ...bool) ([]models.New | ||||
|  | ||||
| 	return result, nil | ||||
| } | ||||
|  | ||||
| func newsSourceReadScrap(src models.NewsSource) ([]models.NewsArticle, error) { | ||||
| 	pgConvert := func(article models.NewsArticle) models.NewsArticle { | ||||
| 		art := &article | ||||
| 		art.GenHash() | ||||
| 		art.Source = src.ID | ||||
| 		article = *art | ||||
| 		return article | ||||
| 	} | ||||
|  | ||||
| 	result := ScrapNewsIndex(src.Source) | ||||
|  | ||||
| 	for idx, page := range result { | ||||
| 		result[idx] = pgConvert(page) | ||||
| 	} | ||||
|  | ||||
| 	return result, nil | ||||
| } | ||||
|   | ||||
| @@ -3,8 +3,11 @@ package services | ||||
| import ( | ||||
| 	"crypto/md5" | ||||
| 	"encoding/hex" | ||||
| 	"fmt" | ||||
| 	"net" | ||||
| 	"net/http" | ||||
| 	"net/url" | ||||
| 	"strings" | ||||
| 	"time" | ||||
|  | ||||
| 	"git.solsynth.dev/hypernet/reader/pkg/internal/database" | ||||
| @@ -122,3 +125,109 @@ func ScrapLink(target string) (*models.LinkMeta, error) { | ||||
|  | ||||
| 	return meta, c.Visit(target) | ||||
| } | ||||
|  | ||||
| const ScrapNewsDefaultUA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1.1 Safari/605.1.15" | ||||
|  | ||||
| func ScrapNewsIndex(target string) []models.NewsArticle { | ||||
| 	parsedTarget, err := url.Parse(target) | ||||
| 	if err != nil { | ||||
| 		return nil | ||||
| 	} | ||||
| 	baseUrl := fmt.Sprintf("%s://%s", parsedTarget.Scheme, parsedTarget.Host) | ||||
|  | ||||
| 	ua := viper.GetString("scraper.news_ua") | ||||
| 	if len(ua) == 0 { | ||||
| 		ua = ScrapNewsDefaultUA | ||||
| 	} | ||||
|  | ||||
| 	c := colly.NewCollector( | ||||
| 		colly.UserAgent(ua), | ||||
| 		colly.MaxDepth(3), | ||||
| 	) | ||||
|  | ||||
| 	c.WithTransport(&http.Transport{ | ||||
| 		Proxy: http.ProxyFromEnvironment, | ||||
| 		DialContext: (&net.Dialer{ | ||||
| 			Timeout:   60 * time.Second, | ||||
| 			KeepAlive: 360 * time.Second, | ||||
| 		}).DialContext, | ||||
| 		MaxIdleConns:          100, | ||||
| 		IdleConnTimeout:       90 * time.Second, | ||||
| 		TLSHandshakeTimeout:   10 * time.Second, | ||||
| 		ExpectContinueTimeout: 1 * time.Second, | ||||
| 	}) | ||||
|  | ||||
| 	var result []models.NewsArticle | ||||
|  | ||||
| 	c.OnHTML("main a", func(e *colly.HTMLElement) { | ||||
| 		url := e.Attr("href") | ||||
| 		if strings.HasPrefix(url, "#") || strings.HasPrefix(url, "javascript:") || strings.HasPrefix(url, "mailto:") { | ||||
| 			return | ||||
| 		} | ||||
| 		if !strings.HasPrefix(url, "http") { | ||||
| 			url = fmt.Sprintf("%s%s", baseUrl, url) | ||||
| 		} | ||||
|  | ||||
| 		article, err := ScrapNews(url) | ||||
| 		if err != nil { | ||||
| 			log.Warn().Err(err).Str("url", url).Msg("Failed to scrap a news article...") | ||||
| 			return | ||||
| 		} | ||||
|  | ||||
| 		log.Debug().Str("url", url).Msg("Scraped a news article...") | ||||
| 		if article != nil { | ||||
| 			result = append(result, *article) | ||||
| 		} | ||||
| 	}) | ||||
|  | ||||
| 	c.Visit(target) | ||||
|  | ||||
| 	return result | ||||
| } | ||||
|  | ||||
| func ScrapNews(target string) (*models.NewsArticle, error) { | ||||
| 	ua := viper.GetString("scraper.news_ua") | ||||
| 	if len(ua) == 0 { | ||||
| 		ua = ScrapNewsDefaultUA | ||||
| 	} | ||||
|  | ||||
| 	c := colly.NewCollector( | ||||
| 		colly.UserAgent(ua), | ||||
| 		colly.MaxDepth(3), | ||||
| 	) | ||||
|  | ||||
| 	c.WithTransport(&http.Transport{ | ||||
| 		Proxy: http.ProxyFromEnvironment, | ||||
| 		DialContext: (&net.Dialer{ | ||||
| 			Timeout:   60 * time.Second, | ||||
| 			KeepAlive: 360 * time.Second, | ||||
| 		}).DialContext, | ||||
| 		MaxIdleConns:          100, | ||||
| 		IdleConnTimeout:       90 * time.Second, | ||||
| 		TLSHandshakeTimeout:   10 * time.Second, | ||||
| 		ExpectContinueTimeout: 1 * time.Second, | ||||
| 	}) | ||||
|  | ||||
| 	article := &models.NewsArticle{ | ||||
| 		URL: target, | ||||
| 	} | ||||
|  | ||||
| 	c.OnHTML("title", func(e *colly.HTMLElement) { | ||||
| 		article.Title = e.Text | ||||
| 	}) | ||||
| 	c.OnHTML("meta[name]", func(e *colly.HTMLElement) { | ||||
| 		switch e.Attr("name") { | ||||
| 		case "description": | ||||
| 			article.Description = e.Attr("content") | ||||
| 		} | ||||
| 	}) | ||||
|  | ||||
| 	c.OnHTML("article", func(e *colly.HTMLElement) { | ||||
| 		article.Content, _ = e.DOM.Html() | ||||
| 	}) | ||||
| 	c.OnHTML("article img", func(e *colly.HTMLElement) { | ||||
| 		article.Thumbnail = e.Attr("src") | ||||
| 	}) | ||||
|  | ||||
| 	return article, c.Visit(target) | ||||
| } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user