diff --git a/README.md b/README.md index 67295cd..be35340 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ Papeer is a powerful **ereader internet vacuum**. It can scrape any website, rem ## Scrape a web page -The `get` command lets you retrieve the content of any web page. +The `get` command lets you retrieve the content of any web page or RSS feed. ``` Scrape URL content @@ -138,7 +138,7 @@ go get -u github.com/lapwat/papeer ```sh # use platform=darwin for MacOS platform=linux -release=0.4.2 +release=0.5.0 # download and extract curl -L https://github.com/lapwat/papeer/releases/download/v$release/papeer-v$release-$platform-amd64.tar.gz > papeer.tar.gz @@ -151,7 +151,7 @@ sudo mv papeer /usr/local/bin ### Windows -Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.4.2/papeer-v0.4.2-windows-amd64.exe.zip). +Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.5.0/papeer-v0.5.0-windows-amd64.exe.zip). ## MOBI support diff --git a/book/scraper.go b/book/scraper.go index 4b2e910..c4b2d2f 100644 --- a/book/scraper.go +++ b/book/scraper.go @@ -15,6 +15,7 @@ import ( "github.com/PuerkitoBio/goquery" readability "github.com/go-shiori/go-readability" colly "github.com/gocolly/colly/v2" + "github.com/mmcdole/gofeed" ) type ScrapeConfig struct { @@ -375,56 +376,75 @@ func GetPath(elm *goquery.Selection) string { } func GetLinks(url *urllib.URL, selector string, limit, offset int, reverse, include bool) ([]link, string, chapter, error) { - selectorSet := true - if len(selector) == 0 { - selector = "a" - selectorSet = false + var links []link + var pathMax string + + parser := gofeed.NewParser() + feed, err := parser.ParseURL(url.String()) + + if err == nil { + // RSS feed + + for _, item := range feed.Items { + links = append(links, NewLink(item.Link, item.Title)) + } + + pathMax = "RSS" + } else { + // HTML website + + selectorSet := true + if len(selector) == 0 { + selector = "a" + selectorSet = false + } + + pathLinks := map[string][]link{} + pathCount := map[string]int{} + pathMax = "" + + // visit and count link classes + c := colly.NewCollector() + c.OnHTML(selector, func(e *colly.HTMLElement) { + href := e.Attr("href") + text := strings.TrimSpace(e.Text) + path := GetPath(e.DOM) + key := path + + if selectorSet { + + // if selector is set, we use the selector specified by the user + + key = selector + pathLinks[key] = append(pathLinks[key], NewLink(href, text)) + pathCount[key] += 1 + pathMax = key + + } else { + + // if selector is not set, we compute the selector ourselves + + class := e.Attr("class") + // include the element class to make sure we have the same exact path for every link in the table of content + key = fmt.Sprintf("%s.%s", path, class) + + // we count this key if the link text is not empty + if text != "" { + pathLinks[key] = append(pathLinks[key], NewLink(href, text)) + pathCount[key] += len(text) + + if pathCount[key] > pathCount[pathMax] { + pathMax = key + } + } + + } + }) + c.Visit(url.String()) + + links = pathLinks[pathMax] } - pathLinks := map[string][]link{} - pathCount := map[string]int{} - pathMax := "" - - // visit and count link classes - c := colly.NewCollector() - c.OnHTML(selector, func(e *colly.HTMLElement) { - href := e.Attr("href") - text := strings.TrimSpace(e.Text) - path := GetPath(e.DOM) - key := path - - if selectorSet { - - // if selector is set, we use the selector specified by the user - - key = selector - pathLinks[key] = append(pathLinks[key], NewLink(href, text)) - pathCount[key] += 1 - pathMax = key - - } else { - - // if selector is not set, we compute the selector ourselves - - class := e.Attr("class") - // include the element class to make sure we have the same exact path for every link in the table of content - key = fmt.Sprintf("%s.%s", path, class) - - // we count this key if the link text is not empty - if text != "" { - pathLinks[key] = append(pathLinks[key], NewLink(href, text)) - pathCount[key] += len(text) - - if pathCount[key] > pathCount[pathMax] { - pathMax = key - } - } - - } - }) - c.Visit(url.String()) - - links := pathLinks[pathMax] if len(links) == 0 { return []link{}, pathMax, chapter{}, fmt.Errorf("no link found for selector: %s", selector) } diff --git a/book/scraper_test.go b/book/scraper_test.go index 79c8ce9..a5f305f 100644 --- a/book/scraper_test.go +++ b/book/scraper_test.go @@ -128,6 +128,22 @@ func TestSubChapters(t *testing.T) { } +func TestSubChaptersRSS(t *testing.T) { + + config0 := NewScrapeConfig() + config1 := NewScrapeConfig() + + c := NewChapterFromURL("https://blog.lapw.at/rss", "", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {}) + + got := len(c.SubChapters()) + want := 8 + + if got != want { + t.Errorf("got %v, wanted %v", got, want) + } + +} + func TestSubChaptersSelector(t *testing.T) { config0 := NewScrapeConfig() diff --git a/cmd/version.go b/cmd/version.go index 650c6b5..1ce8295 100644 --- a/cmd/version.go +++ b/cmd/version.go @@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{ Use: "version", Short: "Print the version number of papeer", Run: func(cmd *cobra.Command, args []string) { - fmt.Println("papeer v0.4.2") + fmt.Println("papeer v0.5.0") }, } diff --git a/go.mod b/go.mod index 81afc60..c53a87c 100644 --- a/go.mod +++ b/go.mod @@ -29,10 +29,15 @@ require ( github.com/golang/protobuf v1.5.2 // indirect github.com/inconshreveable/mousetrap v1.0.0 // indirect github.com/jedib0t/go-pretty/v6 v6.2.4 // indirect + github.com/json-iterator/go v1.1.11 // indirect github.com/kennygrant/sanitize v1.2.4 // indirect github.com/mattn/go-isatty v0.0.14 // indirect github.com/mattn/go-runewidth v0.0.13 // indirect github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect + github.com/mmcdole/gofeed v1.1.3 // indirect + github.com/mmcdole/goxpp v0.0.0-20181012175147-0068e33feabf // indirect + github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421 // indirect + github.com/modern-go/reflect2 v1.0.1 // indirect github.com/rivo/uniseg v0.2.0 // indirect github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect github.com/schollz/progressbar/v3 v3.8.3 // indirect diff --git a/go.sum b/go.sum index f92817e..c5c6ab8 100644 --- a/go.sum +++ b/go.sum @@ -87,6 +87,7 @@ github.com/coreos/go-semver v0.3.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3Ee github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA= +github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= @@ -235,6 +236,8 @@ github.com/jawher/mow.cli v1.1.0/go.mod h1:aNaQlc7ozF3vw6IJ2dHjp2ZFiA4ozMIYY6Pyu github.com/jedib0t/go-pretty/v6 v6.2.4 h1:wdaj2KHD2W+mz8JgJ/Q6L/T5dB7kyqEFI16eLq7GEmk= github.com/jedib0t/go-pretty/v6 v6.2.4/go.mod h1:+nE9fyyHGil+PuISTCrp7avEdo6bqoMwqZnuiK2r2a0= github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo= +github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= +github.com/json-iterator/go v1.1.11 h1:uVUAXhF2To8cbw/3xN3pxj6kk7TYKs98NIrTqPlMWAQ= github.com/json-iterator/go v1.1.11/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= @@ -275,8 +278,14 @@ github.com/mitchellh/iochan v1.0.0/go.mod h1:JwYml1nuB7xOzsp52dPpHFffvOCDupsG0Qu github.com/mitchellh/mapstructure v0.0.0-20160808181253-ca63d7c062ee/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= github.com/mitchellh/mapstructure v1.4.1/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= +github.com/mmcdole/gofeed v1.1.3 h1:pdrvMb18jMSLidGp8j0pLvc9IGziX4vbmvVqmLH6z8o= +github.com/mmcdole/gofeed v1.1.3/go.mod h1:QQO3maftbOu+hiVOGOZDRLymqGQCos4zxbA4j89gMrE= +github.com/mmcdole/goxpp v0.0.0-20181012175147-0068e33feabf h1:sWGE2v+hO0Nd4yFU/S/mDBM5plIU8v/Qhfz41hkDIAI= +github.com/mmcdole/goxpp v0.0.0-20181012175147-0068e33feabf/go.mod h1:pasqhqstspkosTneA62Nc+2p9SOBBYAPbnmRRWPQ0V8= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421 h1:ZqeYNhU3OHLH3mGKHDcjJRFFRrJa6eAM5H+CtDdOsPc= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= +github.com/modern-go/reflect2 v1.0.1 h1:9f412s+6RmYXLWZSEzVVgPGK7C2PphHj5RJrvfx9AWI= github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U= @@ -355,6 +364,7 @@ github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fx github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= github.com/ugorji/go v1.1.4/go.mod h1:uQMGLiO92mf5W77hV/PUCpI3pbzQx3CRekS0kk+RGrc= +github.com/urfave/cli v1.22.3/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0= github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU= github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q= github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=