✨ Scrap news site
This commit is contained in:
parent
48a04b09a4
commit
5aae59a629
@ -6,4 +6,5 @@ type NewsSource struct {
|
|||||||
Type string `json:"type"`
|
Type string `json:"type"`
|
||||||
Source string `json:"source"`
|
Source string `json:"source"`
|
||||||
Depth int `json:"depth"`
|
Depth int `json:"depth"`
|
||||||
|
Enabled bool `json:"enabled"`
|
||||||
}
|
}
|
||||||
|
@ -2,6 +2,7 @@ package api
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"git.solsynth.dev/hypernet/nexus/pkg/nex/sec"
|
"git.solsynth.dev/hypernet/nexus/pkg/nex/sec"
|
||||||
|
"git.solsynth.dev/hypernet/reader/pkg/internal/services"
|
||||||
"github.com/gofiber/fiber/v2"
|
"github.com/gofiber/fiber/v2"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -12,6 +13,11 @@ func MapAPIs(app *fiber.App, baseURL string) {
|
|||||||
|
|
||||||
admin := api.Group("/admin").Name("Admin")
|
admin := api.Group("/admin").Name("Admin")
|
||||||
{
|
{
|
||||||
|
admin.Get("/scan", func(c *fiber.Ctx) error {
|
||||||
|
services.ScanNewsSources()
|
||||||
|
return c.SendStatus(fiber.StatusOK)
|
||||||
|
})
|
||||||
|
|
||||||
admin.Post("/scan", sec.ValidatorMiddleware, adminTriggerScanTask)
|
admin.Post("/scan", sec.ValidatorMiddleware, adminTriggerScanTask)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -16,7 +16,9 @@ func listNewsArticles(c *fiber.Ctx) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var articles []models.NewsArticle
|
var articles []models.NewsArticle
|
||||||
if err := database.C.Limit(take).Offset(offset).Omit("Content").Find(&articles).Error; err != nil {
|
if err := database.C.Limit(take).Offset(offset).
|
||||||
|
Omit("Content").Order("created_at DESC").
|
||||||
|
Find(&articles).Error; err != nil {
|
||||||
return fiber.NewError(fiber.StatusInternalServerError, err.Error())
|
return fiber.NewError(fiber.StatusInternalServerError, err.Error())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -7,8 +7,10 @@ import (
|
|||||||
"git.solsynth.dev/hypernet/reader/pkg/internal/database"
|
"git.solsynth.dev/hypernet/reader/pkg/internal/database"
|
||||||
"git.solsynth.dev/hypernet/reader/pkg/internal/models"
|
"git.solsynth.dev/hypernet/reader/pkg/internal/models"
|
||||||
"github.com/rs/zerolog/log"
|
"github.com/rs/zerolog/log"
|
||||||
|
"github.com/samber/lo"
|
||||||
"github.com/sogko/go-wordpress"
|
"github.com/sogko/go-wordpress"
|
||||||
"github.com/spf13/viper"
|
"github.com/spf13/viper"
|
||||||
|
"gorm.io/gorm/clause"
|
||||||
)
|
)
|
||||||
|
|
||||||
var NewsSources []models.NewsSource
|
var NewsSources []models.NewsSource
|
||||||
@ -28,6 +30,10 @@ func ScanNewsSourcesNoEager() {
|
|||||||
func ScanNewsSources(eager ...bool) {
|
func ScanNewsSources(eager ...bool) {
|
||||||
var results []models.NewsArticle
|
var results []models.NewsArticle
|
||||||
for _, src := range NewsSources {
|
for _, src := range NewsSources {
|
||||||
|
if !src.Enabled {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
log.Debug().Str("source", src.ID).Msg("Scanning news source...")
|
log.Debug().Str("source", src.ID).Msg("Scanning news source...")
|
||||||
result, err := NewsSourceRead(src)
|
result, err := NewsSourceRead(src)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -37,13 +43,22 @@ func ScanNewsSources(eager ...bool) {
|
|||||||
log.Info().Str("source", src.ID).Int("count", len(result)).Msg("Scanned a news sources.")
|
log.Info().Str("source", src.ID).Int("count", len(result)).Msg("Scanned a news sources.")
|
||||||
}
|
}
|
||||||
log.Info().Int("count", len(results)).Msg("Scanned all news sources.")
|
log.Info().Int("count", len(results)).Msg("Scanned all news sources.")
|
||||||
database.C.Save(&results)
|
|
||||||
|
results = lo.UniqBy(results, func(item models.NewsArticle) string {
|
||||||
|
return item.Hash
|
||||||
|
})
|
||||||
|
|
||||||
|
database.C.Clauses(clause.OnConflict{
|
||||||
|
UpdateAll: true,
|
||||||
|
}).Create(&results)
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewsSourceRead(src models.NewsSource, eager ...bool) ([]models.NewsArticle, error) {
|
func NewsSourceRead(src models.NewsSource, eager ...bool) ([]models.NewsArticle, error) {
|
||||||
switch src.Type {
|
switch src.Type {
|
||||||
case "wordpress":
|
case "wordpress":
|
||||||
return newsSourceReadWordpress(src)
|
return newsSourceReadWordpress(src, eager...)
|
||||||
|
case "scrap":
|
||||||
|
return newsSourceReadScrap(src)
|
||||||
default:
|
default:
|
||||||
return nil, fmt.Errorf("unsupported news source type: %s", src.Type)
|
return nil, fmt.Errorf("unsupported news source type: %s", src.Type)
|
||||||
}
|
}
|
||||||
@ -93,3 +108,21 @@ func newsSourceReadWordpress(src models.NewsSource, eager ...bool) ([]models.New
|
|||||||
|
|
||||||
return result, nil
|
return result, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func newsSourceReadScrap(src models.NewsSource) ([]models.NewsArticle, error) {
|
||||||
|
pgConvert := func(article models.NewsArticle) models.NewsArticle {
|
||||||
|
art := &article
|
||||||
|
art.GenHash()
|
||||||
|
art.Source = src.ID
|
||||||
|
article = *art
|
||||||
|
return article
|
||||||
|
}
|
||||||
|
|
||||||
|
result := ScrapNewsIndex(src.Source)
|
||||||
|
|
||||||
|
for idx, page := range result {
|
||||||
|
result[idx] = pgConvert(page)
|
||||||
|
}
|
||||||
|
|
||||||
|
return result, nil
|
||||||
|
}
|
||||||
|
@ -3,8 +3,11 @@ package services
|
|||||||
import (
|
import (
|
||||||
"crypto/md5"
|
"crypto/md5"
|
||||||
"encoding/hex"
|
"encoding/hex"
|
||||||
|
"fmt"
|
||||||
"net"
|
"net"
|
||||||
"net/http"
|
"net/http"
|
||||||
|
"net/url"
|
||||||
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"git.solsynth.dev/hypernet/reader/pkg/internal/database"
|
"git.solsynth.dev/hypernet/reader/pkg/internal/database"
|
||||||
@ -122,3 +125,109 @@ func ScrapLink(target string) (*models.LinkMeta, error) {
|
|||||||
|
|
||||||
return meta, c.Visit(target)
|
return meta, c.Visit(target)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ScrapNewsDefaultUA is the User-Agent the news scrapers fall back to when
// the "scraper.news_ua" configuration value is empty.
const ScrapNewsDefaultUA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1.1 Safari/605.1.15"
|
||||||
|
|
||||||
|
func ScrapNewsIndex(target string) []models.NewsArticle {
|
||||||
|
parsedTarget, err := url.Parse(target)
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
baseUrl := fmt.Sprintf("%s://%s", parsedTarget.Scheme, parsedTarget.Host)
|
||||||
|
|
||||||
|
ua := viper.GetString("scraper.news_ua")
|
||||||
|
if len(ua) == 0 {
|
||||||
|
ua = ScrapNewsDefaultUA
|
||||||
|
}
|
||||||
|
|
||||||
|
c := colly.NewCollector(
|
||||||
|
colly.UserAgent(ua),
|
||||||
|
colly.MaxDepth(3),
|
||||||
|
)
|
||||||
|
|
||||||
|
c.WithTransport(&http.Transport{
|
||||||
|
Proxy: http.ProxyFromEnvironment,
|
||||||
|
DialContext: (&net.Dialer{
|
||||||
|
Timeout: 60 * time.Second,
|
||||||
|
KeepAlive: 360 * time.Second,
|
||||||
|
}).DialContext,
|
||||||
|
MaxIdleConns: 100,
|
||||||
|
IdleConnTimeout: 90 * time.Second,
|
||||||
|
TLSHandshakeTimeout: 10 * time.Second,
|
||||||
|
ExpectContinueTimeout: 1 * time.Second,
|
||||||
|
})
|
||||||
|
|
||||||
|
var result []models.NewsArticle
|
||||||
|
|
||||||
|
c.OnHTML("main a", func(e *colly.HTMLElement) {
|
||||||
|
url := e.Attr("href")
|
||||||
|
if strings.HasPrefix(url, "#") || strings.HasPrefix(url, "javascript:") || strings.HasPrefix(url, "mailto:") {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if !strings.HasPrefix(url, "http") {
|
||||||
|
url = fmt.Sprintf("%s%s", baseUrl, url)
|
||||||
|
}
|
||||||
|
|
||||||
|
article, err := ScrapNews(url)
|
||||||
|
if err != nil {
|
||||||
|
log.Warn().Err(err).Str("url", url).Msg("Failed to scrap a news article...")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Debug().Str("url", url).Msg("Scraped a news article...")
|
||||||
|
if article != nil {
|
||||||
|
result = append(result, *article)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
c.Visit(target)
|
||||||
|
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
func ScrapNews(target string) (*models.NewsArticle, error) {
|
||||||
|
ua := viper.GetString("scraper.news_ua")
|
||||||
|
if len(ua) == 0 {
|
||||||
|
ua = ScrapNewsDefaultUA
|
||||||
|
}
|
||||||
|
|
||||||
|
c := colly.NewCollector(
|
||||||
|
colly.UserAgent(ua),
|
||||||
|
colly.MaxDepth(3),
|
||||||
|
)
|
||||||
|
|
||||||
|
c.WithTransport(&http.Transport{
|
||||||
|
Proxy: http.ProxyFromEnvironment,
|
||||||
|
DialContext: (&net.Dialer{
|
||||||
|
Timeout: 60 * time.Second,
|
||||||
|
KeepAlive: 360 * time.Second,
|
||||||
|
}).DialContext,
|
||||||
|
MaxIdleConns: 100,
|
||||||
|
IdleConnTimeout: 90 * time.Second,
|
||||||
|
TLSHandshakeTimeout: 10 * time.Second,
|
||||||
|
ExpectContinueTimeout: 1 * time.Second,
|
||||||
|
})
|
||||||
|
|
||||||
|
article := &models.NewsArticle{
|
||||||
|
URL: target,
|
||||||
|
}
|
||||||
|
|
||||||
|
c.OnHTML("title", func(e *colly.HTMLElement) {
|
||||||
|
article.Title = e.Text
|
||||||
|
})
|
||||||
|
c.OnHTML("meta[name]", func(e *colly.HTMLElement) {
|
||||||
|
switch e.Attr("name") {
|
||||||
|
case "description":
|
||||||
|
article.Description = e.Attr("content")
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
c.OnHTML("article", func(e *colly.HTMLElement) {
|
||||||
|
article.Content, _ = e.DOM.Html()
|
||||||
|
})
|
||||||
|
c.OnHTML("article img", func(e *colly.HTMLElement) {
|
||||||
|
article.Thumbnail = e.Attr("src")
|
||||||
|
})
|
||||||
|
|
||||||
|
return article, c.Visit(target)
|
||||||
|
}
|
||||||
|
@ -10,6 +10,19 @@ id = "shadiao"
|
|||||||
label = "Shadiao Plus"
|
label = "Shadiao Plus"
|
||||||
type = "wordpress"
|
type = "wordpress"
|
||||||
source = "https://shadiao.plus/wp-json/wp/v2"
|
source = "https://shadiao.plus/wp-json/wp/v2"
|
||||||
|
enabled = false
|
||||||
|
[[sources]]
|
||||||
|
id = "cnn-politics"
|
||||||
|
label = "CNN Politics"
|
||||||
|
type = "scrap"
|
||||||
|
source = "https://www.cnn.com/politics"
|
||||||
|
enabled = false
|
||||||
|
[[sources]]
|
||||||
|
id = "bbc"
|
||||||
|
label = "BBC News"
|
||||||
|
type = "scrap"
|
||||||
|
source = "https://www.bbc.com/news"
|
||||||
|
enabled = true
|
||||||
|
|
||||||
[debug]
|
[debug]
|
||||||
database = true
|
database = true
|
||||||
|
Loading…
x
Reference in New Issue
Block a user