mirror of
https://github.com/NohamR/papeer.git
synced 2026-05-25 12:27:20 +00:00
Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
108e1bdd1a |
15
.github/workflows/format.yml
vendored
Normal file
15
.github/workflows/format.yml
vendored
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
name: Format
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
paths:
|
||||||
|
- '**.go'
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
format:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v2
|
||||||
|
- name: Check formatting
|
||||||
|
run: if [[ -n "$(gofmt -l .)" ]]; then exit 1; fi
|
||||||
15
.github/workflows/test.yml
vendored
Normal file
15
.github/workflows/test.yml
vendored
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
name: Test
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
paths:
|
||||||
|
- '**.go'
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
test:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v2
|
||||||
|
- name: Test
|
||||||
|
run: make test
|
||||||
@@ -24,7 +24,7 @@ Papeer is a powerful **ereader internet vacuum**. It can scrape any website, rem
|
|||||||
|
|
||||||
## Scrape a web page
|
## Scrape a web page
|
||||||
|
|
||||||
The `get` command lets you retrieve the content of any web page.
|
The `get` command lets you retrieve the content of any web page or RSS feed.
|
||||||
|
|
||||||
```
|
```
|
||||||
Scrape URL content
|
Scrape URL content
|
||||||
@@ -138,7 +138,7 @@ go get -u github.com/lapwat/papeer
|
|||||||
```sh
|
```sh
|
||||||
# use platform=darwin for MacOS
|
# use platform=darwin for MacOS
|
||||||
platform=linux
|
platform=linux
|
||||||
release=0.4.2
|
release=0.5.0
|
||||||
|
|
||||||
# download and extract
|
# download and extract
|
||||||
curl -L https://github.com/lapwat/papeer/releases/download/v$release/papeer-v$release-$platform-amd64.tar.gz > papeer.tar.gz
|
curl -L https://github.com/lapwat/papeer/releases/download/v$release/papeer-v$release-$platform-amd64.tar.gz > papeer.tar.gz
|
||||||
@@ -151,7 +151,7 @@ sudo mv papeer /usr/local/bin
|
|||||||
|
|
||||||
### Windows
|
### Windows
|
||||||
|
|
||||||
Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.4.2/papeer-v0.4.2-windows-amd64.exe.zip).
|
Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.5.0/papeer-v0.5.0-windows-amd64.exe.zip).
|
||||||
|
|
||||||
## MOBI support
|
## MOBI support
|
||||||
|
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ import (
|
|||||||
"github.com/PuerkitoBio/goquery"
|
"github.com/PuerkitoBio/goquery"
|
||||||
readability "github.com/go-shiori/go-readability"
|
readability "github.com/go-shiori/go-readability"
|
||||||
colly "github.com/gocolly/colly/v2"
|
colly "github.com/gocolly/colly/v2"
|
||||||
|
"github.com/mmcdole/gofeed"
|
||||||
)
|
)
|
||||||
|
|
||||||
type ScrapeConfig struct {
|
type ScrapeConfig struct {
|
||||||
@@ -375,6 +376,23 @@ func GetPath(elm *goquery.Selection) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func GetLinks(url *urllib.URL, selector string, limit, offset int, reverse, include bool) ([]link, string, chapter, error) {
|
func GetLinks(url *urllib.URL, selector string, limit, offset int, reverse, include bool) ([]link, string, chapter, error) {
|
||||||
|
var links []link
|
||||||
|
var pathMax string
|
||||||
|
|
||||||
|
parser := gofeed.NewParser()
|
||||||
|
feed, err := parser.ParseURL(url.String())
|
||||||
|
|
||||||
|
if err == nil {
|
||||||
|
// RSS feed
|
||||||
|
|
||||||
|
for _, item := range feed.Items {
|
||||||
|
links = append(links, NewLink(item.Link, item.Title))
|
||||||
|
}
|
||||||
|
|
||||||
|
pathMax = "RSS"
|
||||||
|
} else {
|
||||||
|
// HTML website
|
||||||
|
|
||||||
selectorSet := true
|
selectorSet := true
|
||||||
if len(selector) == 0 {
|
if len(selector) == 0 {
|
||||||
selector = "a"
|
selector = "a"
|
||||||
@@ -383,7 +401,7 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, reverse, incl
|
|||||||
|
|
||||||
pathLinks := map[string][]link{}
|
pathLinks := map[string][]link{}
|
||||||
pathCount := map[string]int{}
|
pathCount := map[string]int{}
|
||||||
pathMax := ""
|
pathMax = ""
|
||||||
|
|
||||||
// visit and count link classes
|
// visit and count link classes
|
||||||
c := colly.NewCollector()
|
c := colly.NewCollector()
|
||||||
@@ -424,7 +442,9 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, reverse, incl
|
|||||||
})
|
})
|
||||||
c.Visit(url.String())
|
c.Visit(url.String())
|
||||||
|
|
||||||
links := pathLinks[pathMax]
|
links = pathLinks[pathMax]
|
||||||
|
}
|
||||||
|
|
||||||
if len(links) == 0 {
|
if len(links) == 0 {
|
||||||
return []link{}, pathMax, chapter{}, fmt.Errorf("no link found for selector: %s", selector)
|
return []link{}, pathMax, chapter{}, fmt.Errorf("no link found for selector: %s", selector)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -128,6 +128,22 @@ func TestSubChapters(t *testing.T) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestSubChaptersRSS(t *testing.T) {
|
||||||
|
|
||||||
|
config0 := NewScrapeConfig()
|
||||||
|
config1 := NewScrapeConfig()
|
||||||
|
|
||||||
|
c := NewChapterFromURL("https://blog.lapw.at/rss", "", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {})
|
||||||
|
|
||||||
|
got := len(c.SubChapters())
|
||||||
|
want := 8
|
||||||
|
|
||||||
|
if got != want {
|
||||||
|
t.Errorf("got %v, wanted %v", got, want)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
func TestSubChaptersSelector(t *testing.T) {
|
func TestSubChaptersSelector(t *testing.T) {
|
||||||
|
|
||||||
config0 := NewScrapeConfig()
|
config0 := NewScrapeConfig()
|
||||||
|
|||||||
@@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{
|
|||||||
Use: "version",
|
Use: "version",
|
||||||
Short: "Print the version number of papeer",
|
Short: "Print the version number of papeer",
|
||||||
Run: func(cmd *cobra.Command, args []string) {
|
Run: func(cmd *cobra.Command, args []string) {
|
||||||
fmt.Println("papeer v0.4.2")
|
fmt.Println("papeer v0.5.0")
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|||||||
5
go.mod
5
go.mod
@@ -29,10 +29,15 @@ require (
|
|||||||
github.com/golang/protobuf v1.5.2 // indirect
|
github.com/golang/protobuf v1.5.2 // indirect
|
||||||
github.com/inconshreveable/mousetrap v1.0.0 // indirect
|
github.com/inconshreveable/mousetrap v1.0.0 // indirect
|
||||||
github.com/jedib0t/go-pretty/v6 v6.2.4 // indirect
|
github.com/jedib0t/go-pretty/v6 v6.2.4 // indirect
|
||||||
|
github.com/json-iterator/go v1.1.11 // indirect
|
||||||
github.com/kennygrant/sanitize v1.2.4 // indirect
|
github.com/kennygrant/sanitize v1.2.4 // indirect
|
||||||
github.com/mattn/go-isatty v0.0.14 // indirect
|
github.com/mattn/go-isatty v0.0.14 // indirect
|
||||||
github.com/mattn/go-runewidth v0.0.13 // indirect
|
github.com/mattn/go-runewidth v0.0.13 // indirect
|
||||||
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect
|
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect
|
||||||
|
github.com/mmcdole/gofeed v1.1.3 // indirect
|
||||||
|
github.com/mmcdole/goxpp v0.0.0-20181012175147-0068e33feabf // indirect
|
||||||
|
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421 // indirect
|
||||||
|
github.com/modern-go/reflect2 v1.0.1 // indirect
|
||||||
github.com/rivo/uniseg v0.2.0 // indirect
|
github.com/rivo/uniseg v0.2.0 // indirect
|
||||||
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect
|
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect
|
||||||
github.com/schollz/progressbar/v3 v3.8.3 // indirect
|
github.com/schollz/progressbar/v3 v3.8.3 // indirect
|
||||||
|
|||||||
10
go.sum
10
go.sum
@@ -87,6 +87,7 @@ github.com/coreos/go-semver v0.3.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3Ee
|
|||||||
github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4=
|
github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4=
|
||||||
github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
|
github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
|
||||||
github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA=
|
github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA=
|
||||||
|
github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
|
||||||
github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
|
github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
|
||||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||||
@@ -235,6 +236,8 @@ github.com/jawher/mow.cli v1.1.0/go.mod h1:aNaQlc7ozF3vw6IJ2dHjp2ZFiA4ozMIYY6Pyu
|
|||||||
github.com/jedib0t/go-pretty/v6 v6.2.4 h1:wdaj2KHD2W+mz8JgJ/Q6L/T5dB7kyqEFI16eLq7GEmk=
|
github.com/jedib0t/go-pretty/v6 v6.2.4 h1:wdaj2KHD2W+mz8JgJ/Q6L/T5dB7kyqEFI16eLq7GEmk=
|
||||||
github.com/jedib0t/go-pretty/v6 v6.2.4/go.mod h1:+nE9fyyHGil+PuISTCrp7avEdo6bqoMwqZnuiK2r2a0=
|
github.com/jedib0t/go-pretty/v6 v6.2.4/go.mod h1:+nE9fyyHGil+PuISTCrp7avEdo6bqoMwqZnuiK2r2a0=
|
||||||
github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo=
|
github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo=
|
||||||
|
github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4=
|
||||||
|
github.com/json-iterator/go v1.1.11 h1:uVUAXhF2To8cbw/3xN3pxj6kk7TYKs98NIrTqPlMWAQ=
|
||||||
github.com/json-iterator/go v1.1.11/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4=
|
github.com/json-iterator/go v1.1.11/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4=
|
||||||
github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU=
|
github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU=
|
||||||
github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk=
|
github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk=
|
||||||
@@ -275,8 +278,14 @@ github.com/mitchellh/iochan v1.0.0/go.mod h1:JwYml1nuB7xOzsp52dPpHFffvOCDupsG0Qu
|
|||||||
github.com/mitchellh/mapstructure v0.0.0-20160808181253-ca63d7c062ee/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y=
|
github.com/mitchellh/mapstructure v0.0.0-20160808181253-ca63d7c062ee/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y=
|
||||||
github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y=
|
github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y=
|
||||||
github.com/mitchellh/mapstructure v1.4.1/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo=
|
github.com/mitchellh/mapstructure v1.4.1/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo=
|
||||||
|
github.com/mmcdole/gofeed v1.1.3 h1:pdrvMb18jMSLidGp8j0pLvc9IGziX4vbmvVqmLH6z8o=
|
||||||
|
github.com/mmcdole/gofeed v1.1.3/go.mod h1:QQO3maftbOu+hiVOGOZDRLymqGQCos4zxbA4j89gMrE=
|
||||||
|
github.com/mmcdole/goxpp v0.0.0-20181012175147-0068e33feabf h1:sWGE2v+hO0Nd4yFU/S/mDBM5plIU8v/Qhfz41hkDIAI=
|
||||||
|
github.com/mmcdole/goxpp v0.0.0-20181012175147-0068e33feabf/go.mod h1:pasqhqstspkosTneA62Nc+2p9SOBBYAPbnmRRWPQ0V8=
|
||||||
|
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421 h1:ZqeYNhU3OHLH3mGKHDcjJRFFRrJa6eAM5H+CtDdOsPc=
|
||||||
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
|
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
|
||||||
github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0=
|
github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0=
|
||||||
|
github.com/modern-go/reflect2 v1.0.1 h1:9f412s+6RmYXLWZSEzVVgPGK7C2PphHj5RJrvfx9AWI=
|
||||||
github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0=
|
github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0=
|
||||||
github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
|
github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
|
||||||
github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U=
|
github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U=
|
||||||
@@ -355,6 +364,7 @@ github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fx
|
|||||||
github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
|
github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
|
||||||
github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U=
|
github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U=
|
||||||
github.com/ugorji/go v1.1.4/go.mod h1:uQMGLiO92mf5W77hV/PUCpI3pbzQx3CRekS0kk+RGrc=
|
github.com/ugorji/go v1.1.4/go.mod h1:uQMGLiO92mf5W77hV/PUCpI3pbzQx3CRekS0kk+RGrc=
|
||||||
|
github.com/urfave/cli v1.22.3/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0=
|
||||||
github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU=
|
github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU=
|
||||||
github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q=
|
github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q=
|
||||||
github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
|
github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
|
||||||
|
|||||||
Reference in New Issue
Block a user