✨ Scrape sites following RSS / Atom feeds
@@ -3,6 +3,7 @@ package models
 import (
 	"crypto/md5"
 	"encoding/hex"
+	"time"

 	"git.solsynth.dev/hypernet/nexus/pkg/nex/cruda"
 	"github.com/google/uuid"
@@ -11,13 +12,14 @@ import (
 type NewsArticle struct {
 	cruda.BaseModel

-	Thumbnail   string `json:"thumbnail"`
-	Title       string `json:"title"`
-	Description string `json:"description"`
-	Content     string `json:"content"`
-	URL         string `json:"url"`
-	Hash        string `json:"hash" gorm:"uniqueIndex"`
-	Source      string `json:"source"`
+	Thumbnail   string     `json:"thumbnail"`
+	Title       string     `json:"title"`
+	Description string     `json:"description"`
+	Content     string     `json:"content"`
+	URL         string     `json:"url"`
+	Hash        string     `json:"hash" gorm:"uniqueIndex"`
+	Source      string     `json:"source"`
+	PublishedAt *time.Time `json:"published_at"`
 }

 func (v *NewsArticle) GenHash() *NewsArticle {
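The hunk above adds PublishedAt but does not show the body of GenHash. Given the crypto/md5 and encoding/hex imports and the unique index on Hash, it presumably derives a stable dedup key from the article; a minimal sketch under that assumption (not the actual implementation, which may hash other fields):

    // Hypothetical sketch only: fingerprint the article by URL for the gorm uniqueIndex.
    func (v *NewsArticle) GenHash() *NewsArticle {
        sum := md5.Sum([]byte(v.URL))
        v.Hash = hex.EncodeToString(sum[:])
        return v
    }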
@@ -1,11 +1,14 @@
 package services

 import (
+	"context"
 	"fmt"
 	"strconv"
+	"time"

 	"git.solsynth.dev/hypernet/reader/pkg/internal/database"
 	"git.solsynth.dev/hypernet/reader/pkg/internal/models"
+	"github.com/mmcdole/gofeed"
 	"github.com/rs/zerolog/log"
 	"github.com/samber/lo"
 	"github.com/sogko/go-wordpress"
@@ -59,6 +62,8 @@ func NewsSourceRead(src models.NewsSource, eager ...bool) ([]models.NewsArticle,
 		return newsSourceReadWordpress(src, eager...)
 	case "scrap":
 		return newsSourceReadScrap(src)
+	case "feed":
+		return newsSourceReadFeed(src)
 	default:
 		return nil, fmt.Errorf("unsupported news source type: %s", src.Type)
 	}
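For context: a NewsSource whose Type is "feed" now goes through gofeed instead of the scraper. A hedged usage sketch; the ID, Type and Source fields match how src is used in this file, but the values (and any other required fields) are made up:

    src := models.NewsSource{
        ID:     "example-blog",                 // hypothetical identifier
        Type:   "feed",                         // dispatches to newsSourceReadFeed
        Source: "https://example.com/atom.xml", // made-up feed URL
    }
    articles, err := NewsSourceRead(src)
    if err != nil {
        log.Error().Err(err).Msg("Failed to read news source")
    } else {
        log.Info().Int("count", len(articles)).Msg("Fetched news articles")
    }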
@@ -73,6 +78,10 @@ func newsSourceReadWordpress(src models.NewsSource, eager ...bool) ([]models.New
 			URL:         post.Link,
 			Source:      src.ID,
 		}
+		publishedAt, err := time.Parse("2006-01-02T15:04:05", post.DateGMT)
+		if err == nil {
+			article.PublishedAt = &publishedAt
+		}
 		article.GenHash()
 		return *article
 	}
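The layout string "2006-01-02T15:04:05" mirrors the zone-less date_gmt values WordPress returns, and time.Parse treats a zone-less layout as UTC, which is the intended reading for a GMT timestamp. A quick standalone check with a made-up value:

    package main

    import (
        "fmt"
        "time"
    )

    func main() {
        // date_gmt-style value; no zone in the layout, so Parse yields UTC.
        t, err := time.Parse("2006-01-02T15:04:05", "2024-05-01T08:30:00")
        if err != nil {
            panic(err)
        }
        fmt.Println(t) // 2024-05-01 08:30:00 +0000 UTC
    }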
@@ -109,6 +118,54 @@ func newsSourceReadWordpress(src models.NewsSource, eager ...bool) ([]models.New
 	return result, nil
 }

+func newsSourceReadFeed(src models.NewsSource) ([]models.NewsArticle, error) {
+	pgConvert := func(article models.NewsArticle) models.NewsArticle {
+		art := &article
+		art.GenHash()
+		art.Source = src.ID
+		article = *art
+		return article
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
+	defer cancel()
+	fp := gofeed.NewParser()
+	feed, err := fp.ParseURLWithContext(src.Source, ctx)
+	if err != nil {
+		return nil, err
+	}
+
+	var result []models.NewsArticle
+	for _, item := range feed.Items {
+		parent := models.NewsArticle{
+			Title:       item.Title,
+			Description: item.Description,
+		}
+		if item.PublishedParsed != nil {
+			parent.PublishedAt = item.PublishedParsed
+		}
+		if item.Image != nil {
+			parent.Thumbnail = item.Image.URL
+		}
+		if len(item.Content) > 0 {
+			// The feed already carries full content, so skip scraping the page.
+			parent.Content = item.Content
+			result = append(result, pgConvert(parent))
+		} else {
+			article, err := ScrapNews(item.Link, parent)
+			if err != nil {
+				log.Warn().Err(err).Str("url", item.Link).Msg("Failed to scrape a news article...")
+				continue
+			}
+			result = append(result, pgConvert(*article))
+		}
+
+		log.Debug().Str("url", item.Link).Msg("Collected a news article...")
+	}
+
+	return result, nil
+}
+
 func newsSourceReadScrap(src models.NewsSource) ([]models.NewsArticle, error) {
 	pgConvert := func(article models.NewsArticle) models.NewsArticle {
 		art := &article
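gofeed parses RSS and Atom (and JSON Feed) into one normalized Item shape (Title, Description, Content, Link, Image, PublishedParsed), which is what lets the loop above treat all feed types uniformly. A standalone sketch of the same content-or-scrape decision, with an illustrative URL:

    package main

    import (
        "context"
        "fmt"
        "time"

        "github.com/mmcdole/gofeed"
    )

    func main() {
        ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
        defer cancel()

        fp := gofeed.NewParser()
        // Illustrative feed URL only.
        feed, err := fp.ParseURLWithContext("https://example.com/feed.xml", ctx)
        if err != nil {
            panic(err)
        }

        for _, item := range feed.Items {
            if len(item.Content) > 0 {
                fmt.Println(item.Link, "-> full content shipped in the feed")
            } else {
                fmt.Println(item.Link, "-> would fall back to ScrapNews")
            }
        }
    }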
@@ -185,7 +185,7 @@ func ScrapNewsIndex(target string) []models.NewsArticle {
 	return result
 }

-func ScrapNews(target string) (*models.NewsArticle, error) {
+func ScrapNews(target string, parent ...models.NewsArticle) (*models.NewsArticle, error) {
 	ua := viper.GetString("scraper.news_ua")
 	if len(ua) == 0 {
 		ua = ScrapNewsDefaultUA
@@ -212,21 +212,35 @@ func ScrapNews(target string) (*models.NewsArticle, error) {
 		URL: target,
 	}

+	if len(parent) > 0 {
+		article.Content = parent[0].Content
+		article.Thumbnail = parent[0].Thumbnail
+		article.Description = parent[0].Description
+	}
+
 	c.OnHTML("title", func(e *colly.HTMLElement) {
-		article.Title = e.Text
+		if len(article.Title) == 0 {
+			article.Title = e.Text
+		}
 	})
 	c.OnHTML("meta[name]", func(e *colly.HTMLElement) {
 		switch e.Attr("name") {
 		case "description":
-			article.Description = e.Attr("content")
+			if len(article.Description) == 0 {
+				article.Description = e.Attr("content")
+			}
 		}
 	})

 	c.OnHTML("article", func(e *colly.HTMLElement) {
-		article.Content, _ = e.DOM.Html()
+		if len(article.Content) == 0 {
+			article.Content, _ = e.DOM.Html()
+		}
 	})
 	c.OnHTML("article img", func(e *colly.HTMLElement) {
-		article.Thumbnail = e.Attr("src")
+		if len(article.Thumbnail) == 0 {
+			article.Thumbnail = e.Attr("src")
+		}
 	})

 	return article, c.Visit(target)
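Because parent is variadic, existing callers of ScrapNews are untouched, while the feed path can seed the article and let the collector callbacks fill only the fields that are still empty. An illustrative call from inside the services package (URL and values made up):

    // Seed the scrape with what the feed already provided; callbacks only fill blanks.
    parent := models.NewsArticle{
        Description: "Summary lifted from the feed entry",
        Thumbnail:   "https://example.com/cover.jpg",
    }
    article, err := ScrapNews("https://example.com/posts/hello-world", parent)
    if err == nil {
        article.GenHash()
        log.Info().Str("title", article.Title).Str("hash", article.Hash).Msg("Scraped with feed fallback")
    }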