✨ Scrap sites following RSS / Atom feeds
@@ -3,6 +3,7 @@ package models
 import (
 	"crypto/md5"
 	"encoding/hex"
+	"time"
 
 	"git.solsynth.dev/hypernet/nexus/pkg/nex/cruda"
 	"github.com/google/uuid"
@@ -11,13 +12,14 @@ import (
 type NewsArticle struct {
 	cruda.BaseModel
 
-	Thumbnail   string `json:"thumbnail"`
-	Title       string `json:"title"`
-	Description string `json:"description"`
-	Content     string `json:"content"`
-	URL         string `json:"url"`
-	Hash        string `json:"hash" gorm:"uniqueIndex"`
-	Source      string `json:"source"`
+	Thumbnail   string     `json:"thumbnail"`
+	Title       string     `json:"title"`
+	Description string     `json:"description"`
+	Content     string     `json:"content"`
+	URL         string     `json:"url"`
+	Hash        string     `json:"hash" gorm:"uniqueIndex"`
+	Source      string     `json:"source"`
+	PublishedAt *time.Time `json:"published_at"`
 }
 
 func (v *NewsArticle) GenHash() *NewsArticle {
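The body of GenHash sits outside the changed region, so it does not appear in this commit. Judging from the crypto/md5 and encoding/hex imports and the uniqueIndex on Hash, a plausible sketch (an assumption, not the committed code) is:

```go
// Hypothetical sketch of GenHash: derive a stable dedupe key for the
// unique `hash` column. md5 serves as a fingerprint here, not as a
// security boundary.
func (v *NewsArticle) GenHash() *NewsArticle {
	sum := md5.Sum([]byte(v.URL))
	v.Hash = hex.EncodeToString(sum[:])
	return v
}
```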
@@ -1,11 +1,14 @@
 package services
 
 import (
+	"context"
 	"fmt"
 	"strconv"
 	"time"
 
 	"git.solsynth.dev/hypernet/reader/pkg/internal/database"
 	"git.solsynth.dev/hypernet/reader/pkg/internal/models"
+	"github.com/mmcdole/gofeed"
 	"github.com/rs/zerolog/log"
+	"github.com/samber/lo"
 	"github.com/sogko/go-wordpress"
@@ -59,6 +62,8 @@ func NewsSourceRead(src models.NewsSource, eager ...bool) ([]models.NewsArticle,
 		return newsSourceReadWordpress(src, eager...)
 	case "scrap":
 		return newsSourceReadScrap(src)
+	case "feed":
+		return newsSourceReadFeed(src)
 	default:
 		return nil, fmt.Errorf("unsupported news source type: %s", src.Type)
 	}
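With the new branch in place, a feed-backed source only needs its Type set to "feed". A hedged usage sketch follows; the NewsSource field names mirror their use in these hunks (src.ID, src.Type, src.Source), and the URL is made up:

```go
// Hypothetical call site for the new "feed" source type.
src := models.NewsSource{
	ID:     "example-blog",
	Type:   "feed",
	Source: "https://example.com/atom.xml",
}
articles, err := NewsSourceRead(src)
if err != nil {
	log.Error().Err(err).Msg("failed to read feed source")
}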
@@ -73,6 +78,10 @@ func newsSourceReadWordpress(src models.NewsSource, eager ...bool) ([]models.New
 			URL:    post.Link,
 			Source: src.ID,
 		}
+		time, err := time.Parse("2006-01-02T15:04:05", post.DateGMT)
+		if err == nil {
+			article.PublishedAt = &time
+		}
 		article.GenHash()
 		return *article
 	}
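One quirk in this hunk: `time, err := time.Parse(...)` declares a local variable named `time`, shadowing the `time` package for the rest of the block. It compiles because the right-hand side is evaluated before the new name exists, but a clearer equivalent (a sketch, not part of the commit) avoids the shadow:

```go
// Sketch: parse WordPress's GMT timestamp without shadowing the time package.
// Go layout strings are written against the reference time "2006-01-02T15:04:05".
publishedAt, err := time.Parse("2006-01-02T15:04:05", post.DateGMT)
if err == nil {
	article.PublishedAt = &publishedAt
}
```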
@@ -109,6 +118,54 @@ func newsSourceReadWordpress(src models.NewsSource, eager ...bool) ([]models.New
 	return result, nil
 }
 
+func newsSourceReadFeed(src models.NewsSource) ([]models.NewsArticle, error) {
+	pgConvert := func(article models.NewsArticle) models.NewsArticle {
+		art := &article
+		art.GenHash()
+		art.Source = src.ID
+		article = *art
+		return article
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
+	defer cancel()
+	fp := gofeed.NewParser()
+	feed, err := fp.ParseURLWithContext(src.Source, ctx)
+	if err != nil {
+		return nil, err
+	}
+
+	var result []models.NewsArticle
+	for _, item := range feed.Items {
+		parent := models.NewsArticle{
+			Title:       item.Title,
+			Description: item.Description,
+		}
+		if item.PublishedParsed != nil {
+			parent.PublishedAt = item.PublishedParsed
+		}
+		if item.Image != nil {
+			parent.Thumbnail = item.Image.URL
+		}
+		if len(item.Content) > 0 {
+			// The feed already ships full content, so the page doesn't need scraping.
+			parent.Content = item.Content
+			result = append(result, pgConvert(parent))
+		} else {
+			article, err := ScrapNews(item.Link, parent)
+			if err != nil {
+				log.Warn().Err(err).Str("url", item.Link).Msg("Failed to scrap a news article...")
+				continue
+			}
+			result = append(result, pgConvert(*article))
+		}
+
+		log.Debug().Str("url", item.Link).Msg("Scraped a news article...")
+	}
+
+	return result, nil
+}
+
 func newsSourceReadScrap(src models.NewsSource) ([]models.NewsArticle, error) {
 	pgConvert := func(article models.NewsArticle) models.NewsArticle {
 		art := &article
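For reference, here are the gofeed calls used above in isolation; this is a self-contained sketch with a made-up feed URL, not code from the commit:

```go
package main

import (
	"context"
	"fmt"
	"time"

	"github.com/mmcdole/gofeed"
)

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
	defer cancel()

	fp := gofeed.NewParser()
	feed, err := fp.ParseURLWithContext("https://example.com/feed.xml", ctx)
	if err != nil {
		panic(err)
	}
	for _, item := range feed.Items {
		// Only some feeds embed the full body in item.Content; many carry just
		// a summary in item.Description, which is why the commit falls back to
		// ScrapNews when Content is empty.
		fmt.Printf("%s (has content: %v)\n", item.Title, len(item.Content) > 0)
	}
}
```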
@@ -185,7 +185,7 @@ func ScrapNewsIndex(target string) []models.NewsArticle {
 	return result
 }
 
-func ScrapNews(target string) (*models.NewsArticle, error) {
+func ScrapNews(target string, parent ...models.NewsArticle) (*models.NewsArticle, error) {
 	ua := viper.GetString("scraper.news_ua")
 	if len(ua) == 0 {
 		ua = ScrapNewsDefaultUA
@@ -212,21 +212,35 @@ func ScrapNews(target string) (*models.NewsArticle, error) {
 		URL: target,
 	}
 
+	if len(parent) > 0 {
+		article.Content = parent[0].Content
+		article.Thumbnail = parent[0].Thumbnail
+		article.Description = parent[0].Description
+	}
+
 	c.OnHTML("title", func(e *colly.HTMLElement) {
-		article.Title = e.Text
+		if len(article.Title) == 0 {
+			article.Title = e.Text
+		}
 	})
 	c.OnHTML("meta[name]", func(e *colly.HTMLElement) {
 		switch e.Attr("name") {
 		case "description":
-			article.Description = e.Attr("content")
+			if len(article.Description) == 0 {
+				article.Description = e.Attr("content")
+			}
 		}
 	})
 
 	c.OnHTML("article", func(e *colly.HTMLElement) {
-		article.Content, _ = e.DOM.Html()
+		if len(article.Content) == 0 {
+			article.Content, _ = e.DOM.Html()
+		}
 	})
 	c.OnHTML("article img", func(e *colly.HTMLElement) {
-		article.Thumbnail = e.Attr("src")
+		if len(article.Thumbnail) == 0 {
+			article.Thumbnail = e.Attr("src")
+		}
 	})
 
 	return article, c.Visit(target)
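A hedged sketch of how the variadic parent plays out: fields carried over from a feed item win, and the colly handlers above only fill what is still empty. The URL and values here are made up:

```go
// Hypothetical call site: prefill what the feed already knows, then let
// the scraper complete the rest.
seed := models.NewsArticle{
	Description: "Summary taken from the feed entry",
	Thumbnail:   "https://example.com/cover.jpg",
}
article, err := ScrapNews("https://example.com/posts/42", seed)
if err != nil {
	log.Warn().Err(err).Msg("scrape failed")
}
// article.Description and article.Thumbnail keep the seeded values; Title and
// Content are scraped from the page because they started out empty.
```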