handle RSS feed

This commit is contained in:
lapwat
2022-03-02 19:13:43 +01:00
parent be69854b17
commit 13c9138d01
6 changed files with 103 additions and 52 deletions

View File

@@ -15,6 +15,7 @@ import (
"github.com/PuerkitoBio/goquery"
readability "github.com/go-shiori/go-readability"
colly "github.com/gocolly/colly/v2"
"github.com/mmcdole/gofeed"
)
type ScrapeConfig struct {
@@ -375,56 +376,75 @@ func GetPath(elm *goquery.Selection) string {
}
func GetLinks(url *urllib.URL, selector string, limit, offset int, reverse, include bool) ([]link, string, chapter, error) {
selectorSet := true
if len(selector) == 0 {
selector = "a"
selectorSet = false
var links []link
var pathMax string
parser := gofeed.NewParser()
feed, err := parser.ParseURL(url.String())
if err == nil {
// RSS feed
for _, item := range feed.Items {
links = append(links, NewLink(item.Link, item.Title))
}
pathMax = "RSS"
} else {
// HTML website
selectorSet := true
if len(selector) == 0 {
selector = "a"
selectorSet = false
}
pathLinks := map[string][]link{}
pathCount := map[string]int{}
pathMax = ""
// visit and count link classes
c := colly.NewCollector()
c.OnHTML(selector, func(e *colly.HTMLElement) {
href := e.Attr("href")
text := strings.TrimSpace(e.Text)
path := GetPath(e.DOM)
key := path
if selectorSet {
// if selector is set, we use the selector specified by the user
key = selector
pathLinks[key] = append(pathLinks[key], NewLink(href, text))
pathCount[key] += 1
pathMax = key
} else {
// if selector is not set, we compute the selector ourselves
class := e.Attr("class")
// include the element class to make sure we have the same exact path for every link in the table of content
key = fmt.Sprintf("%s.%s", path, class)
// we count this key if the link text is not empty
if text != "" {
pathLinks[key] = append(pathLinks[key], NewLink(href, text))
pathCount[key] += len(text)
if pathCount[key] > pathCount[pathMax] {
pathMax = key
}
}
}
})
c.Visit(url.String())
links = pathLinks[pathMax]
}
pathLinks := map[string][]link{}
pathCount := map[string]int{}
pathMax := ""
// visit and count link classes
c := colly.NewCollector()
c.OnHTML(selector, func(e *colly.HTMLElement) {
href := e.Attr("href")
text := strings.TrimSpace(e.Text)
path := GetPath(e.DOM)
key := path
if selectorSet {
// if selector is set, we use the selector specified by the user
key = selector
pathLinks[key] = append(pathLinks[key], NewLink(href, text))
pathCount[key] += 1
pathMax = key
} else {
// if selector is not set, we compute the selector ourselves
class := e.Attr("class")
// include the element class to make sure we have the same exact path for every link in the table of content
key = fmt.Sprintf("%s.%s", path, class)
// we count this key if the link text is not empty
if text != "" {
pathLinks[key] = append(pathLinks[key], NewLink(href, text))
pathCount[key] += len(text)
if pathCount[key] > pathCount[pathMax] {
pathMax = key
}
}
}
})
c.Visit(url.String())
links := pathLinks[pathMax]
if len(links) == 0 {
return []link{}, pathMax, chapter{}, fmt.Errorf("no link found for selector: %s", selector)
}

View File

@@ -128,6 +128,22 @@ func TestSubChapters(t *testing.T) {
}
func TestSubChaptersRSS(t *testing.T) {
config0 := NewScrapeConfig()
config1 := NewScrapeConfig()
c := NewChapterFromURL("https://blog.lapw.at/rss", "", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {})
got := len(c.SubChapters())
want := 8
if got != want {
t.Errorf("got %v, wanted %v", got, want)
}
}
func TestSubChaptersSelector(t *testing.T) {
config0 := NewScrapeConfig()