mirror of
https://github.com/NohamR/papeer.git
synced 2026-05-25 12:27:20 +00:00
Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
079122f4a0 | ||
|
|
9ee5f43748 | ||
|
|
e3833fdddd | ||
|
|
a7d14e7372 | ||
|
|
ac1fb3dd51 | ||
|
|
bb1df3b6a3 | ||
|
|
83adc22e2c | ||
|
|
73441aa7fb |
56
README.md
56
README.md
@@ -1,12 +1,49 @@
|
|||||||
|
```
|
||||||
|
❯ papeer get --format epub --recursive --delay 500 --limit 10 https://news.ycombinator.com/
|
||||||
|
6s [===============================================>--------------------] 70% Status: 7 out of 10 chapters
|
||||||
|
0s [====================================================================] 100% 1. Three ex-US intelligence officers admit hacking for UAE
|
||||||
|
0s [====================================================================] 100% 2. Show HN: Time Travel Debugger
|
||||||
|
0s [====================================================================] 100% 3. How much faster is Java 17?
|
||||||
|
0s [====================================================================] 100% 4. The First Webcam Was Invented to Keep an Eye on a Coffee Pot
|
||||||
|
0s [====================================================================] 100% 5. Nikon's 2021 Photomicrography Competition Winners
|
||||||
|
0s [====================================================================] 100% 6. HTTP Status 418 – I'm a teapot
|
||||||
|
0s [====================================================================] 100% 7. H3: Hexagonal hierarchical geospatial indexing system
|
||||||
|
--- [--------------------------------------------------------------------] 0% 8. Automatic cipher suite ordering in Go’s crypto/tls
|
||||||
|
--- [--------------------------------------------------------------------] 0% 9. Find engineering roles at over 800 YC-funded startups
|
||||||
|
--- [--------------------------------------------------------------------] 0% 10. Futarchy: Robin Hanson on prediction markets
|
||||||
|
Ebook saved to "Hacker_News.epub"
|
||||||
|
```
|
||||||
|
|
||||||
# Installation
|
# Installation
|
||||||
|
|
||||||
|
## From source
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
go install github.com/lapwat/papeer
|
go get -u github.com/lapwat/papeer
|
||||||
|
```
|
||||||
|
|
||||||
|
## From binary
|
||||||
|
|
||||||
|
```sh
|
||||||
|
curl -L https://github.com/lapwat/papeer/releases/download/v0.0.2/papeer-v0.0.2 > papeer
|
||||||
|
chmod +x papeer
|
||||||
|
sudo mv papeer /usr/local/bin
|
||||||
|
```
|
||||||
|
|
||||||
|
## Install kindlegen to export websites to MOBI (optional)
|
||||||
|
|
||||||
|
```sh
|
||||||
|
TMPDIR=$(mktemp -d -t papeer-XXXXX)
|
||||||
|
curl -L https://github.com/lapwat/papeer/releases/download/kindlegen/kindlegen_linux_2.6_i386_v2_9.tar.gz > $TMPDIR/kindlegen.tar.gz
|
||||||
|
tar xzvf $TMPDIR/kindlegen.tar.gz -C $TMPDIR
|
||||||
|
chmod +x $TMPDIR/kindlegen
|
||||||
|
sudo mv $TMPDIR/kindlegen /usr/local/bin
|
||||||
|
rm $TMPDIR
|
||||||
```
|
```
|
||||||
|
|
||||||
# Usage
|
# Usage
|
||||||
|
|
||||||
```sh
|
```
|
||||||
Browse the web in the eink era
|
Browse the web in the eink era
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
@@ -39,5 +76,18 @@ Use "papeer [command] --help" for more information about a command.
|
|||||||
Execute this command in your current shell, or add it to your `.bashrc`.
|
Execute this command in your current shell, or add it to your `.bashrc`.
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
. <(papeer completion [bash|fish|powershell|zsh])
|
. <(papeer completion bash)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Type `papeer completion bash -h` for more information.
|
||||||
|
|
||||||
|
You can replace `bash` with your own shell (zsh, fish or powershell).
|
||||||
|
|
||||||
|
# Dependencies
|
||||||
|
|
||||||
|
- `cobra` command line interface
|
||||||
|
- `go-readability` extract content from HTML
|
||||||
|
- `html-to-markdown` convert HTML to Markdown
|
||||||
|
- `go-epub` convert HTML to EPUB
|
||||||
|
- `colly` query HTML trees
|
||||||
|
- `uiprogress` display progress bars
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ package book
|
|||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"log"
|
"log"
|
||||||
|
"math"
|
||||||
urllib "net/url"
|
urllib "net/url"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
@@ -17,11 +18,11 @@ type scraper struct {
|
|||||||
url string
|
url string
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewBookFromURL(url, selector string, include bool, delay int) Book {
|
func NewBookFromURL(url, selector string, include bool, limit, delay int) Book {
|
||||||
home := NewChapterFromURL(url)
|
home := NewChapterFromURL(url)
|
||||||
b := New(home.Name(), home.Author())
|
b := New(home.Name(), home.Author())
|
||||||
|
|
||||||
chapters := tableOfContent(url, selector, delay)
|
chapters := tableOfContent(url, selector, limit, delay)
|
||||||
if include {
|
if include {
|
||||||
b.AddChapter(home)
|
b.AddChapter(home)
|
||||||
}
|
}
|
||||||
@@ -44,40 +45,25 @@ func NewChapterFromURL(url string) chapter {
|
|||||||
return chapter{article.Title, article.Byline, article.Content}
|
return chapter{article.Title, article.Byline, article.Content}
|
||||||
}
|
}
|
||||||
|
|
||||||
func tableOfContent(url, selector string, delay int) []chapter {
|
func tableOfContent(url, selector string, limit, delay int) []chapter {
|
||||||
c := colly.NewCollector()
|
c := colly.NewCollector()
|
||||||
|
|
||||||
classesLinks := map[string][]map[string]string{}
|
classesLinks := map[string][]map[string]string{}
|
||||||
classesCount := map[string]int{}
|
classesCount := map[string]int{}
|
||||||
classMax := ""
|
classMax := ""
|
||||||
|
|
||||||
|
selectorSet := true
|
||||||
if selector == "" {
|
if selector == "" {
|
||||||
c.OnHTML("a", func(e *colly.HTMLElement) {
|
selector = "a"
|
||||||
href := e.Attr("href")
|
selectorSet = false
|
||||||
text := strings.TrimSpace(e.Text)
|
|
||||||
class := e.Attr("class")
|
|
||||||
|
|
||||||
//if class != "" && text != "" {
|
|
||||||
classesLinks[class] = append(classesLinks[class], map[string]string{
|
|
||||||
"href": href,
|
|
||||||
"text": text,
|
|
||||||
})
|
|
||||||
|
|
||||||
classesCount[class]++
|
|
||||||
|
|
||||||
if classesCount[class] > classesCount[classMax] {
|
|
||||||
classMax = class
|
|
||||||
}
|
}
|
||||||
//}
|
|
||||||
|
|
||||||
})
|
|
||||||
} else {
|
|
||||||
c.OnHTML(selector, func(e *colly.HTMLElement) {
|
c.OnHTML(selector, func(e *colly.HTMLElement) {
|
||||||
href := e.Attr("href")
|
href := e.Attr("href")
|
||||||
text := strings.TrimSpace(e.Text)
|
text := strings.TrimSpace(e.Text)
|
||||||
class := e.Attr("class")
|
class := e.Attr("class")
|
||||||
|
|
||||||
//if class != "" && text != "" {
|
if selectorSet || class != "" && text != "" {
|
||||||
classesLinks[class] = append(classesLinks[class], map[string]string{
|
classesLinks[class] = append(classesLinks[class], map[string]string{
|
||||||
"href": href,
|
"href": href,
|
||||||
"text": text,
|
"text": text,
|
||||||
@@ -88,12 +74,16 @@ func tableOfContent(url, selector string, delay int) []chapter {
|
|||||||
if classesCount[class] > classesCount[classMax] {
|
if classesCount[class] > classesCount[classMax] {
|
||||||
classMax = class
|
classMax = class
|
||||||
}
|
}
|
||||||
//}
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
})
|
||||||
c.Visit(url)
|
c.Visit(url)
|
||||||
fmt.Println(classesCount)
|
|
||||||
links := classesLinks[classMax]
|
links := classesLinks[classMax]
|
||||||
|
if limit != -1 {
|
||||||
|
limit = int(math.Min(float64(limit), float64(len(links))))
|
||||||
|
links = links[:limit]
|
||||||
|
}
|
||||||
|
|
||||||
chapters := make([]chapter, len(links))
|
chapters := make([]chapter, len(links))
|
||||||
base, err := urllib.Parse(url)
|
base, err := urllib.Parse(url)
|
||||||
|
|||||||
20
cmd/get.go
20
cmd/get.go
@@ -17,7 +17,7 @@ import (
|
|||||||
|
|
||||||
var quiet, stdout, recursive, include bool
|
var quiet, stdout, recursive, include bool
|
||||||
var format, output, selector string
|
var format, output, selector string
|
||||||
var delay int
|
var limit, delay int
|
||||||
|
|
||||||
var getCmd = &cobra.Command{
|
var getCmd = &cobra.Command{
|
||||||
Use: "get",
|
Use: "get",
|
||||||
@@ -48,10 +48,22 @@ var getCmd = &cobra.Command{
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if include && recursive == false {
|
if cmd.Flags().Changed("selector") && recursive == false {
|
||||||
|
return errors.New("cannot use selector option if not in recursive mode")
|
||||||
|
}
|
||||||
|
|
||||||
|
if cmd.Flags().Changed("include") && recursive == false {
|
||||||
return errors.New("cannot use include option if not in recursive mode")
|
return errors.New("cannot use include option if not in recursive mode")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if cmd.Flags().Changed("limit") && recursive == false {
|
||||||
|
return errors.New("cannot use limit option if not in recursive mode")
|
||||||
|
}
|
||||||
|
|
||||||
|
if cmd.Flags().Changed("delay") && recursive == false {
|
||||||
|
return errors.New("cannot use delay option if not in recursive mode")
|
||||||
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
},
|
},
|
||||||
Run: func(cmd *cobra.Command, args []string) {
|
Run: func(cmd *cobra.Command, args []string) {
|
||||||
@@ -59,7 +71,7 @@ var getCmd = &cobra.Command{
|
|||||||
var b book.Book
|
var b book.Book
|
||||||
|
|
||||||
if recursive {
|
if recursive {
|
||||||
b = book.NewBookFromURL(url, selector, include, delay)
|
b = book.NewBookFromURL(url, selector, include, limit, delay)
|
||||||
} else {
|
} else {
|
||||||
c := book.NewChapterFromURL(url)
|
c := book.NewChapterFromURL(url)
|
||||||
b = book.New(c.Name(), c.Author())
|
b = book.New(c.Name(), c.Author())
|
||||||
@@ -104,8 +116,10 @@ var getCmd = &cobra.Command{
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if stdout == false {
|
||||||
fmt.Printf("Markdown saved to \"%s\"\n", output)
|
fmt.Printf("Markdown saved to \"%s\"\n", output)
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if format == "epub" {
|
if format == "epub" {
|
||||||
e := epub.NewEpub(b.Name())
|
e := epub.NewEpub(b.Name())
|
||||||
|
|||||||
26
cmd/list.go
26
cmd/list.go
@@ -3,6 +3,8 @@ package cmd
|
|||||||
import (
|
import (
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"log"
|
||||||
|
urllib "net/url"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
colly "github.com/gocolly/colly/v2"
|
colly "github.com/gocolly/colly/v2"
|
||||||
@@ -19,20 +21,27 @@ var listCmd = &cobra.Command{
|
|||||||
return nil
|
return nil
|
||||||
},
|
},
|
||||||
Run: func(cmd *cobra.Command, args []string) {
|
Run: func(cmd *cobra.Command, args []string) {
|
||||||
url := args[0]
|
base, err := urllib.Parse(args[0])
|
||||||
c := colly.NewCollector()
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if selector == "" {
|
||||||
|
selector = "a"
|
||||||
|
}
|
||||||
|
|
||||||
// visit and count link classes
|
// visit and count link classes
|
||||||
classesLinks := map[string][]map[string]string{}
|
classesLinks := map[string][]map[string]string{}
|
||||||
classesCount := map[string]int{}
|
classesCount := map[string]int{}
|
||||||
classMax := ""
|
classMax := ""
|
||||||
|
|
||||||
|
c := colly.NewCollector()
|
||||||
c.OnHTML(selector, func(e *colly.HTMLElement) {
|
c.OnHTML(selector, func(e *colly.HTMLElement) {
|
||||||
href := e.Attr("href")
|
href := e.Attr("href")
|
||||||
text := strings.TrimSpace(e.Text)
|
text := strings.TrimSpace(e.Text)
|
||||||
class := e.Attr("class")
|
class := e.Attr("class")
|
||||||
|
|
||||||
// if class != "" && text != "" {
|
if cmd.Flags().Changed("selector") || class != "" && text != "" {
|
||||||
classesLinks[class] = append(classesLinks[class], map[string]string{
|
classesLinks[class] = append(classesLinks[class], map[string]string{
|
||||||
"href": href,
|
"href": href,
|
||||||
"text": text,
|
"text": text,
|
||||||
@@ -43,12 +52,17 @@ var listCmd = &cobra.Command{
|
|||||||
if classesCount[class] > classesCount[classMax] {
|
if classesCount[class] > classesCount[classMax] {
|
||||||
classMax = class
|
classMax = class
|
||||||
}
|
}
|
||||||
// }
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
c.Visit(url)
|
c.Visit(base.String())
|
||||||
for index, link := range classesLinks[classMax] {
|
for index, link := range classesLinks[classMax] {
|
||||||
fmt.Printf("Chapter %d: %s %s\n", index+1, link["text"], link["href"])
|
u, err := base.Parse(link["href"])
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Printf("Chapter %d: %s %s\n", index+1, link["text"], u.String())
|
||||||
}
|
}
|
||||||
|
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -28,9 +28,10 @@ func init() {
|
|||||||
rootCmd.PersistentFlags().StringVarP(&selector, "selector", "s", "", "table of content CSS selector")
|
rootCmd.PersistentFlags().StringVarP(&selector, "selector", "s", "", "table of content CSS selector")
|
||||||
rootCmd.PersistentFlags().BoolVarP(&recursive, "recursive", "r", false, "create one chapter per natigation item")
|
rootCmd.PersistentFlags().BoolVarP(&recursive, "recursive", "r", false, "create one chapter per natigation item")
|
||||||
rootCmd.PersistentFlags().BoolVarP(&include, "include", "i", false, "include URL as first chapter, in resursive mode")
|
rootCmd.PersistentFlags().BoolVarP(&include, "include", "i", false, "include URL as first chapter, in resursive mode")
|
||||||
rootCmd.PersistentFlags().BoolVarP(&quiet, "quiet", "q", false, "do not show progress bars")
|
rootCmd.PersistentFlags().BoolVarP(&quiet, "quiet", "q", false, "do not show logs")
|
||||||
rootCmd.PersistentFlags().BoolVarP(&stdout, "stdout", "", false, "print to standard output")
|
rootCmd.PersistentFlags().BoolVarP(&stdout, "stdout", "", false, "print to standard output")
|
||||||
rootCmd.PersistentFlags().IntVarP(&delay, "delay", "d", -1, "wait before downloading next chapter, in milliseconds")
|
rootCmd.PersistentFlags().IntVarP(&limit, "limit", "l", -1, "limit number of chapters, in recursive mode")
|
||||||
|
rootCmd.PersistentFlags().IntVarP(&delay, "delay", "d", -1, "time to wait before downloading next chapter, in milliseconds")
|
||||||
|
|
||||||
rootCmd.AddCommand(getCmd)
|
rootCmd.AddCommand(getCmd)
|
||||||
rootCmd.AddCommand(listCmd)
|
rootCmd.AddCommand(listCmd)
|
||||||
|
|||||||
@@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{
|
|||||||
Use: "version",
|
Use: "version",
|
||||||
Short: "Print the version number of papeer",
|
Short: "Print the version number of papeer",
|
||||||
Run: func(cmd *cobra.Command, args []string) {
|
Run: func(cmd *cobra.Command, args []string) {
|
||||||
fmt.Println("papeer v0.0.1")
|
fmt.Println("papeer v0.0.2")
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user