✨ Scrap site following rss / atom feed

2025-01-26 00:12:50 +08:00
parent 5aae59a629
commit 4b57fd9f19
6 changed files with 128 additions and 18 deletions
--- a/pkg/internal/models/new.go
+++ b/pkg/internal/models/new.go
@@ -3,6 +3,7 @@ package models
 import (
 	"crypto/md5"
 	"encoding/hex"
+	"time"

 	"git.solsynth.dev/hypernet/nexus/pkg/nex/cruda"
 	"github.com/google/uuid"
@@ -11,13 +12,14 @@ import (
+// NewsArticle is a news article collected from a news source.
+//
+// Hash carries a unique index, so two stored articles cannot share a hash.
+// PublishedAt is a pointer and stays nil when no publication time was obtained.
 type NewsArticle struct {
 	cruda.BaseModel
 
-	Thumbnail   string `json:"thumbnail"`
-	Title       string `json:"title"`
-	Description string `json:"description"`
-	Content     string `json:"content"`
-	URL         string `json:"url"`
-	Hash        string `json:"hash" gorm:"uniqueIndex"`
-	Source      string `json:"source"`
+	Thumbnail   string     `json:"thumbnail"`
+	Title       string     `json:"title"`
+	Description string     `json:"description"`
+	Content     string     `json:"content"`
+	URL         string     `json:"url"`
+	Hash        string     `json:"hash" gorm:"uniqueIndex"`
+	Source      string     `json:"source"`
+	PublishedAt *time.Time `json:"published_at"`
 }

 func (v *NewsArticle) GenHash() *NewsArticle {
--- a/pkg/internal/services/reader.go
+++ b/pkg/internal/services/reader.go
@@ -1,11 +1,14 @@
 package services

 import (
+	"context"
 	"fmt"
 	"strconv"
+	"time"

 	"git.solsynth.dev/hypernet/reader/pkg/internal/database"
 	"git.solsynth.dev/hypernet/reader/pkg/internal/models"
+	"github.com/mmcdole/gofeed"
 	"github.com/rs/zerolog/log"
 	"github.com/samber/lo"
 	"github.com/sogko/go-wordpress"
@@ -59,6 +62,8 @@ func NewsSourceRead(src models.NewsSource, eager ...bool) ([]models.NewsArticle,
 		return newsSourceReadWordpress(src, eager...)
 	case "scrap":
 		return newsSourceReadScrap(src)
+	case "feed":
+		return newsSourceReadFeed(src)
 	default:
 		return nil, fmt.Errorf("unsupported news source type: %s", src.Type)
 	}
@@ -73,6 +78,10 @@ func newsSourceReadWordpress(src models.NewsSource, eager ...bool) ([]models.New
 			URL:         post.Link,
 			Source:      src.ID,
 		}
+		time, err := time.Parse("2006-01-02T15:04:05", post.DateGMT)
+		if err == nil {
+			article.PublishedAt = &time
+		}
 		article.GenHash()
 		return *article
 	}
@@ -109,6 +118,51 @@ func newsSourceReadWordpress(src models.NewsSource, eager ...bool) ([]models.New
 	return result, nil
 }

+// newsSourceReadFeed fetches the RSS / Atom feed at src.Source and returns one
+// article per feed item.
+//
+// Items that ship their own content are used as-is. Items without content are
+// completed by scraping item.Link; an item whose scrape fails is logged and
+// skipped rather than failing the whole read. An error is returned only when
+// the feed itself cannot be fetched or parsed.
+func newsSourceReadFeed(src models.NewsSource) ([]models.NewsArticle, error) {
+	// Bound the feed download so a stalled server cannot block the reader.
+	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
+	defer cancel()
+
+	feed, err := gofeed.NewParser().ParseURLWithContext(src.Source, ctx)
+	if err != nil {
+		// feed is nil on failure; ranging over feed.Items would panic.
+		return nil, fmt.Errorf("parsing feed %q: %w", src.Source, err)
+	}
+
+	var result []models.NewsArticle
+	for _, item := range feed.Items {
+		parent := models.NewsArticle{
+			Title:       item.Title,
+			Description: item.Description,
+			Content:     item.Content,
+			// Set the link here so articles that skip scraping still have a URL.
+			URL:         item.Link,
+			PublishedAt: item.PublishedParsed,
+		}
+		if item.Image != nil {
+			parent.Thumbnail = item.Image.URL
+		}
+
+		article := &parent
+		if len(item.Content) == 0 {
+			// The feed carries no body, so fetch it from the linked page.
+			scraped, scrapErr := ScrapNews(item.Link, parent)
+			if scrapErr != nil {
+				log.Warn().Err(scrapErr).Str("url", item.Link).Msg("Failed to scrap a news article...")
+				continue
+			}
+			// ScrapNews only takes content, thumbnail and description from
+			// parent, so restore the feed's title and publication time.
+			scraped.PublishedAt = parent.PublishedAt
+			if len(parent.Title) > 0 {
+				scraped.Title = parent.Title
+			}
+			article = scraped
+		}
+
+		article.GenHash()
+		article.Source = src.ID
+		result = append(result, *article)
+
+		log.Debug().Str("url", item.Link).Msg("Scraped a news article...")
+	}
+
+	return result, nil
+}
+
 func newsSourceReadScrap(src models.NewsSource) ([]models.NewsArticle, error) {
 	pgConvert := func(article models.NewsArticle) models.NewsArticle {
 		art := &article
--- a/pkg/internal/services/scraper.go
+++ b/pkg/internal/services/scraper.go
@@ -185,7 +185,7 @@ func ScrapNewsIndex(target string) []models.NewsArticle {
 	return result
 }

-func ScrapNews(target string) (*models.NewsArticle, error) {
+func ScrapNews(target string, parent ...models.NewsArticle) (*models.NewsArticle, error) {
 	ua := viper.GetString("scraper.news_ua")
 	if len(ua) == 0 {
 		ua = ScrapNewsDefaultUA
@@ -212,21 +212,35 @@ func ScrapNews(target string) (*models.NewsArticle, error) {
 		URL: target,
 	}

+	if len(parent) > 0 {
+		article.Content = parent[0].Content
+		article.Thumbnail = parent[0].Thumbnail
+		article.Description = parent[0].Description
+	}
+
 	c.OnHTML("title", func(e *colly.HTMLElement) {
-		article.Title = e.Text
+		if len(article.Title) == 0 {
+			article.Title = e.Text
+		}
 	})
 	c.OnHTML("meta[name]", func(e *colly.HTMLElement) {
 		switch e.Attr("name") {
 		case "description":
-			article.Description = e.Attr("content")
+			if len(article.Description) == 0 {
+				article.Description = e.Attr("content")
+			}
 		}
 	})

 	c.OnHTML("article", func(e *colly.HTMLElement) {
-		article.Content, _ = e.DOM.Html()
+		if len(article.Content) == 0 {
+			article.Content, _ = e.DOM.Html()
+		}
 	})
 	c.OnHTML("article img", func(e *colly.HTMLElement) {
-		article.Thumbnail = e.Attr("src")
+		if len(article.Thumbnail) == 0 {
+			article.Thumbnail = e.Attr("src")
+		}
 	})

 	return article, c.Visit(target)