mirror of
https://github.com/NohamR/papeer.git
synced 2026-05-25 20:00:47 +00:00
Compare commits
4 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
91d126a667 | ||
|
|
4d6171544d | ||
|
|
53d0dd1360 | ||
|
|
13c9138d01 |
15
.github/workflows/format.yml
vendored
Normal file
15
.github/workflows/format.yml
vendored
Normal file
@@ -0,0 +1,15 @@
|
||||
# Format workflow: fails a push that touches Go files when any file in the
# repository is reported by `gofmt -l`.
name: Format

on:
  push:
    paths:
      - '**.go'

jobs:
  format:
    runs-on: ubuntu-latest
    steps:
      # Pin the toolchain to the same version the Test workflow uses, so the
      # gofmt result does not depend on whichever Go the runner image ships.
      - name: Set up Go
        uses: actions/setup-go@v2
        with:
          # Quoted so YAML keeps it a string (an unquoted 1.20 would become 1.2).
          go-version: '1.17'
      - name: Checkout
        uses: actions/checkout@v2
      - name: Check formatting
        run: |
          # gofmt -l prints the path of every file whose formatting differs.
          unformatted="$(gofmt -l .)"
          if [[ -n "$unformatted" ]]; then
            # Show the offending files so the failure is actionable from the log.
            echo "The following files are not gofmt-formatted:"
            echo "$unformatted"
            exit 1
          fi
|
||||
25
.github/workflows/test.yml
vendored
Normal file
25
.github/workflows/test.yml
vendored
Normal file
@@ -0,0 +1,25 @@
|
||||
# Test workflow: runs `make test` on every push that touches Go files.
name: Test

on:
  push:
    paths:
      - '**.go'

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - name: Set up Go
        uses: actions/setup-go@v2
        with:
          # Quoted so YAML keeps it a string (an unquoted 1.20 would become 1.2).
          go-version: '1.17'
      # The kindlegen binary is downloaded, unpacked and moved onto the PATH
      # before the repository is checked out.
      - name: Install kindlegen
        run: |
          # -f makes curl exit non-zero on an HTTP error instead of saving the
          # error page as the archive, so a broken download fails here rather
          # than at the tar step.
          curl -fL https://github.com/lapwat/papeer/releases/download/kindlegen/kindlegen_linux_2.6_i386_v2_9.tar.gz > kindlegen.tar.gz
          tar xzvf kindlegen.tar.gz
          chmod +x kindlegen
          mv kindlegen /usr/local/bin
      - name: Checkout
        uses: actions/checkout@v2
      - name: Test
        run: make test
|
||||
@@ -24,7 +24,7 @@ Papeer is a powerful **ereader internet vacuum**. It can scrape any website, rem
|
||||
|
||||
## Scrape a web page
|
||||
|
||||
The `get` command lets you retrieve the content of any web page.
|
||||
The `get` command lets you retrieve the content of any web page or RSS feed.
|
||||
|
||||
```
|
||||
Scrape URL content
|
||||
@@ -138,7 +138,7 @@ go get -u github.com/lapwat/papeer
|
||||
```sh
|
||||
# use platform=darwin for MacOS
|
||||
platform=linux
|
||||
release=0.4.2
|
||||
release=0.5.2
|
||||
|
||||
# download and extract
|
||||
curl -L https://github.com/lapwat/papeer/releases/download/v$release/papeer-v$release-$platform-amd64.tar.gz > papeer.tar.gz
|
||||
@@ -151,7 +151,7 @@ sudo mv papeer /usr/local/bin
|
||||
|
||||
### Windows
|
||||
|
||||
Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.4.2/papeer-v0.4.2-windows-amd64.exe.zip).
|
||||
Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.5.2/papeer-v0.5.2-windows-amd64.exe.zip).
|
||||
|
||||
## MOBI support
|
||||
|
||||
|
||||
116
book/scraper.go
116
book/scraper.go
@@ -15,6 +15,7 @@ import (
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
readability "github.com/go-shiori/go-readability"
|
||||
colly "github.com/gocolly/colly/v2"
|
||||
"github.com/mmcdole/gofeed"
|
||||
)
|
||||
|
||||
type ScrapeConfig struct {
|
||||
@@ -375,56 +376,75 @@ func GetPath(elm *goquery.Selection) string {
|
||||
}
|
||||
|
||||
func GetLinks(url *urllib.URL, selector string, limit, offset int, reverse, include bool) ([]link, string, chapter, error) {
|
||||
selectorSet := true
|
||||
if len(selector) == 0 {
|
||||
selector = "a"
|
||||
selectorSet = false
|
||||
var links []link
|
||||
var pathMax string
|
||||
|
||||
parser := gofeed.NewParser()
|
||||
feed, err := parser.ParseURL(url.String())
|
||||
|
||||
if err == nil {
|
||||
// RSS feed
|
||||
|
||||
for _, item := range feed.Items {
|
||||
links = append(links, NewLink(item.Link, item.Title))
|
||||
}
|
||||
|
||||
pathMax = "RSS"
|
||||
} else {
|
||||
// HTML website
|
||||
|
||||
selectorSet := true
|
||||
if len(selector) == 0 {
|
||||
selector = "a"
|
||||
selectorSet = false
|
||||
}
|
||||
|
||||
pathLinks := map[string][]link{}
|
||||
pathCount := map[string]int{}
|
||||
pathMax = ""
|
||||
|
||||
// visit and count link classes
|
||||
c := colly.NewCollector()
|
||||
c.OnHTML(selector, func(e *colly.HTMLElement) {
|
||||
href := e.Attr("href")
|
||||
text := strings.TrimSpace(e.Text)
|
||||
path := GetPath(e.DOM)
|
||||
key := path
|
||||
|
||||
if selectorSet {
|
||||
|
||||
// if selector is set, we use the selector specified by the user
|
||||
|
||||
key = selector
|
||||
pathLinks[key] = append(pathLinks[key], NewLink(href, text))
|
||||
pathCount[key] += 1
|
||||
pathMax = key
|
||||
|
||||
} else {
|
||||
|
||||
// if selector is not set, we compute the selector ourselves
|
||||
|
||||
class := e.Attr("class")
|
||||
// include the element class to make sure we have the same exact path for every link in the table of content
|
||||
key = fmt.Sprintf("%s.%s", path, class)
|
||||
|
||||
// we count this key if the link text is not empty
|
||||
if text != "" {
|
||||
pathLinks[key] = append(pathLinks[key], NewLink(href, text))
|
||||
pathCount[key] += len(text)
|
||||
|
||||
if pathCount[key] > pathCount[pathMax] {
|
||||
pathMax = key
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
})
|
||||
c.Visit(url.String())
|
||||
|
||||
links = pathLinks[pathMax]
|
||||
}
|
||||
|
||||
pathLinks := map[string][]link{}
|
||||
pathCount := map[string]int{}
|
||||
pathMax := ""
|
||||
|
||||
// visit and count link classes
|
||||
c := colly.NewCollector()
|
||||
c.OnHTML(selector, func(e *colly.HTMLElement) {
|
||||
href := e.Attr("href")
|
||||
text := strings.TrimSpace(e.Text)
|
||||
path := GetPath(e.DOM)
|
||||
key := path
|
||||
|
||||
if selectorSet {
|
||||
|
||||
// if selector is set, we use the selector specified by the user
|
||||
|
||||
key = selector
|
||||
pathLinks[key] = append(pathLinks[key], NewLink(href, text))
|
||||
pathCount[key] += 1
|
||||
pathMax = key
|
||||
|
||||
} else {
|
||||
|
||||
// if selector is not set, we compute the selector ourselves
|
||||
|
||||
class := e.Attr("class")
|
||||
// include the element class to make sure we have the same exact path for every link in the table of content
|
||||
key = fmt.Sprintf("%s.%s", path, class)
|
||||
|
||||
// we count this key if the link text is not empty
|
||||
if text != "" {
|
||||
pathLinks[key] = append(pathLinks[key], NewLink(href, text))
|
||||
pathCount[key] += len(text)
|
||||
|
||||
if pathCount[key] > pathCount[pathMax] {
|
||||
pathMax = key
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
})
|
||||
c.Visit(url.String())
|
||||
|
||||
links := pathLinks[pathMax]
|
||||
if len(links) == 0 {
|
||||
return []link{}, pathMax, chapter{}, fmt.Errorf("no link found for selector: %s", selector)
|
||||
}
|
||||
|
||||
@@ -128,6 +128,22 @@ func TestSubChapters(t *testing.T) {
|
||||
|
||||
}
|
||||
|
||||
func TestSubChaptersRSS(t *testing.T) {
|
||||
|
||||
config0 := NewScrapeConfig()
|
||||
config1 := NewScrapeConfig()
|
||||
|
||||
c := NewChapterFromURL("https://blog.lapw.at/rss", "", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {})
|
||||
|
||||
got := len(c.SubChapters())
|
||||
want := 8
|
||||
|
||||
if got != want {
|
||||
t.Errorf("got %v, wanted %v", got, want)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func TestSubChaptersSelector(t *testing.T) {
|
||||
|
||||
config0 := NewScrapeConfig()
|
||||
|
||||
@@ -69,7 +69,7 @@ var listCmd = &cobra.Command{
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
links, path, _, err := book.GetLinks(base, listOpts.Selector[0], listOpts.limit, listOpts.offset, listOpts.reverse, listOpts.include)
|
||||
links, path, home, err := book.GetLinks(base, listOpts.Selector[0], listOpts.limit, listOpts.offset, listOpts.reverse, listOpts.include)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
@@ -80,6 +80,8 @@ var listCmd = &cobra.Command{
|
||||
t.Style().Options.SeparateColumns = false
|
||||
t.Style().Options.SeparateHeader = false
|
||||
|
||||
t.SetTitle(home.Name())
|
||||
|
||||
// format selector path
|
||||
pathArray := strings.Split(path, "<")
|
||||
// reverse path
|
||||
|
||||
@@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{
|
||||
Use: "version",
|
||||
Short: "Print the version number of papeer",
|
||||
Run: func(cmd *cobra.Command, args []string) {
|
||||
fmt.Println("papeer v0.4.2")
|
||||
fmt.Println("papeer v0.5.2")
|
||||
},
|
||||
}
|
||||
|
||||
5
go.mod
5
go.mod
@@ -29,10 +29,15 @@ require (
|
||||
github.com/golang/protobuf v1.5.2 // indirect
|
||||
github.com/inconshreveable/mousetrap v1.0.0 // indirect
|
||||
github.com/jedib0t/go-pretty/v6 v6.2.4 // indirect
|
||||
github.com/json-iterator/go v1.1.11 // indirect
|
||||
github.com/kennygrant/sanitize v1.2.4 // indirect
|
||||
github.com/mattn/go-isatty v0.0.14 // indirect
|
||||
github.com/mattn/go-runewidth v0.0.13 // indirect
|
||||
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect
|
||||
github.com/mmcdole/gofeed v1.1.3 // indirect
|
||||
github.com/mmcdole/goxpp v0.0.0-20181012175147-0068e33feabf // indirect
|
||||
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421 // indirect
|
||||
github.com/modern-go/reflect2 v1.0.1 // indirect
|
||||
github.com/rivo/uniseg v0.2.0 // indirect
|
||||
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect
|
||||
github.com/schollz/progressbar/v3 v3.8.3 // indirect
|
||||
|
||||
10
go.sum
10
go.sum
@@ -87,6 +87,7 @@ github.com/coreos/go-semver v0.3.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3Ee
|
||||
github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4=
|
||||
github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
|
||||
github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA=
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
@@ -235,6 +236,8 @@ github.com/jawher/mow.cli v1.1.0/go.mod h1:aNaQlc7ozF3vw6IJ2dHjp2ZFiA4ozMIYY6Pyu
|
||||
github.com/jedib0t/go-pretty/v6 v6.2.4 h1:wdaj2KHD2W+mz8JgJ/Q6L/T5dB7kyqEFI16eLq7GEmk=
|
||||
github.com/jedib0t/go-pretty/v6 v6.2.4/go.mod h1:+nE9fyyHGil+PuISTCrp7avEdo6bqoMwqZnuiK2r2a0=
|
||||
github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo=
|
||||
github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4=
|
||||
github.com/json-iterator/go v1.1.11 h1:uVUAXhF2To8cbw/3xN3pxj6kk7TYKs98NIrTqPlMWAQ=
|
||||
github.com/json-iterator/go v1.1.11/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4=
|
||||
github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU=
|
||||
github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk=
|
||||
@@ -275,8 +278,14 @@ github.com/mitchellh/iochan v1.0.0/go.mod h1:JwYml1nuB7xOzsp52dPpHFffvOCDupsG0Qu
|
||||
github.com/mitchellh/mapstructure v0.0.0-20160808181253-ca63d7c062ee/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y=
|
||||
github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y=
|
||||
github.com/mitchellh/mapstructure v1.4.1/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo=
|
||||
github.com/mmcdole/gofeed v1.1.3 h1:pdrvMb18jMSLidGp8j0pLvc9IGziX4vbmvVqmLH6z8o=
|
||||
github.com/mmcdole/gofeed v1.1.3/go.mod h1:QQO3maftbOu+hiVOGOZDRLymqGQCos4zxbA4j89gMrE=
|
||||
github.com/mmcdole/goxpp v0.0.0-20181012175147-0068e33feabf h1:sWGE2v+hO0Nd4yFU/S/mDBM5plIU8v/Qhfz41hkDIAI=
|
||||
github.com/mmcdole/goxpp v0.0.0-20181012175147-0068e33feabf/go.mod h1:pasqhqstspkosTneA62Nc+2p9SOBBYAPbnmRRWPQ0V8=
|
||||
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421 h1:ZqeYNhU3OHLH3mGKHDcjJRFFRrJa6eAM5H+CtDdOsPc=
|
||||
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
|
||||
github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0=
|
||||
github.com/modern-go/reflect2 v1.0.1 h1:9f412s+6RmYXLWZSEzVVgPGK7C2PphHj5RJrvfx9AWI=
|
||||
github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0=
|
||||
github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
|
||||
github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U=
|
||||
@@ -355,6 +364,7 @@ github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fx
|
||||
github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
|
||||
github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U=
|
||||
github.com/ugorji/go v1.1.4/go.mod h1:uQMGLiO92mf5W77hV/PUCpI3pbzQx3CRekS0kk+RGrc=
|
||||
github.com/urfave/cli v1.22.3/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0=
|
||||
github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU=
|
||||
github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q=
|
||||
github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
|
||||
|
||||
13
release.sh
13
release.sh
@@ -14,15 +14,14 @@ do
|
||||
platform_split=(${platform//\// })
|
||||
GOOS=${platform_split[0]}
|
||||
GOARCH=${platform_split[1]}
|
||||
output_name=papeer
|
||||
|
||||
if [ $GOOS = "windows" ]; then
|
||||
env GOOS=$GOOS GOARCH=$GOARCH go build -o "$output_name.exe"
|
||||
zip "$output_name-v$version-$GOOS-$GOARCH.exe.zip" "$output_name.exe"
|
||||
rm "$output_name.exe"
|
||||
env GOOS=$GOOS GOARCH=$GOARCH go build -o papeer.exe
|
||||
zip "papeer-v$version-$GOOS-$GOARCH.exe.zip" papeer.exe
|
||||
rm papeer.exe
|
||||
else
|
||||
env GOOS=$GOOS GOARCH=$GOARCH go build -o "$output_name"
|
||||
tar czvf "$output_name-v$version-$GOOS-$GOARCH.tar.gz" "$output_name"
|
||||
rm "$output_name"
|
||||
env GOOS=$GOOS GOARCH=$GOARCH go build -o papeer
|
||||
tar czvf "papeer-v$version-$GOOS-$GOARCH.tar.gz" papeer
|
||||
rm papeer
|
||||
fi
|
||||
done
|
||||
|
||||
Reference in New Issue
Block a user