8 Commits

Author SHA1 Message Date
lapwat
079122f4a0 docs 2021-09-15 20:51:16 +02:00
lapwat
9ee5f43748 docs 2021-09-15 20:47:53 +02:00
lapwat
e3833fdddd docs 2021-09-15 20:47:22 +02:00
lapwat
a7d14e7372 bump version 2021-09-15 20:26:50 +02:00
lapwat
ac1fb3dd51 limit option, docs 2021-09-15 20:25:23 +02:00
lapwat
bb1df3b6a3 update readme 2021-09-14 23:28:22 +02:00
lapwat
83adc22e2c update readme 2021-09-14 23:27:26 +02:00
lapwat
73441aa7fb update readme 2021-09-14 23:18:32 +02:00
6 changed files with 127 additions and 58 deletions

View File

@@ -1,12 +1,49 @@
```
papeer get --format epub --recursive --delay 500 --limit 10 https://news.ycombinator.com/
6s [===============================================>--------------------] 70% Status: 7 out of 10 chapters
0s [====================================================================] 100% 1. Three ex-US intelligence officers admit hacking for UAE
0s [====================================================================] 100% 2. Show HN: Time Travel Debugger
0s [====================================================================] 100% 3. How much faster is Java 17?
0s [====================================================================] 100% 4. The First Webcam Was Invented to Keep an Eye on a Coffee Pot
0s [====================================================================] 100% 5. Nikon's 2021 Photomicrography Competition Winners
0s [====================================================================] 100% 6. HTTP Status 418 I'm a teapot
0s [====================================================================] 100% 7. H3: Hexagonal hierarchical geospatial indexing system
--- [--------------------------------------------------------------------] 0% 8. Automatic cipher suite ordering in Go's crypto/tls
--- [--------------------------------------------------------------------] 0% 9. Find engineering roles at over 800 YC-funded startups
--- [--------------------------------------------------------------------] 0% 10. Futarchy: Robin Hanson on prediction markets
Ebook saved to "Hacker_News.epub"
```
# Installation # Installation
## From source
```sh ```sh
go install github.com/lapwat/papeer go get -u github.com/lapwat/papeer
```
## From binary
```sh
curl -L https://github.com/lapwat/papeer/releases/download/v0.0.2/papeer-v0.0.2 > papeer
chmod +x papeer
sudo mv papeer /usr/local/bin
```
## Install kindlegen to export websites to MOBI (optional)
```sh
TMPDIR=$(mktemp -d -t papeer-XXXXX)
curl -L https://github.com/lapwat/papeer/releases/download/kindlegen/kindlegen_linux_2.6_i386_v2_9.tar.gz > $TMPDIR/kindlegen.tar.gz
tar xzvf $TMPDIR/kindlegen.tar.gz -C $TMPDIR
chmod +x $TMPDIR/kindlegen
sudo mv $TMPDIR/kindlegen /usr/local/bin
rm -r $TMPDIR
``` ```
# Usage # Usage
```sh ```
Browse the web in the eink era Browse the web in the eink era
Usage: Usage:
@@ -39,5 +76,18 @@ Use "papeer [command] --help" for more information about a command.
Execute this command in your current shell, or add it to your `.bashrc`. Execute this command in your current shell, or add it to your `.bashrc`.
```sh ```sh
. <(papeer completion [bash|fish|powershell|zsh]) . <(papeer completion bash)
``` ```
Type `papeer completion bash -h` for more information.
You can replace `bash` with your own shell (zsh, fish or powershell).
# Dependencies
- `cobra` command line interface
- `go-readability` extract content from HTML
- `html-to-markdown` convert HTML to Markdown
- `go-epub` convert HTML to EPUB
- `colly` query HTML trees
- `uiprogress` display progress bars

View File

@@ -3,6 +3,7 @@ package book
import ( import (
"fmt" "fmt"
"log" "log"
"math"
urllib "net/url" urllib "net/url"
"strings" "strings"
"sync" "sync"
@@ -17,11 +18,11 @@ type scraper struct {
url string url string
} }
func NewBookFromURL(url, selector string, include bool, delay int) Book { func NewBookFromURL(url, selector string, include bool, limit, delay int) Book {
home := NewChapterFromURL(url) home := NewChapterFromURL(url)
b := New(home.Name(), home.Author()) b := New(home.Name(), home.Author())
chapters := tableOfContent(url, selector, delay) chapters := tableOfContent(url, selector, limit, delay)
if include { if include {
b.AddChapter(home) b.AddChapter(home)
} }
@@ -44,40 +45,25 @@ func NewChapterFromURL(url string) chapter {
return chapter{article.Title, article.Byline, article.Content} return chapter{article.Title, article.Byline, article.Content}
} }
func tableOfContent(url, selector string, delay int) []chapter { func tableOfContent(url, selector string, limit, delay int) []chapter {
c := colly.NewCollector() c := colly.NewCollector()
classesLinks := map[string][]map[string]string{} classesLinks := map[string][]map[string]string{}
classesCount := map[string]int{} classesCount := map[string]int{}
classMax := "" classMax := ""
selectorSet := true
if selector == "" { if selector == "" {
c.OnHTML("a", func(e *colly.HTMLElement) { selector = "a"
href := e.Attr("href") selectorSet = false
text := strings.TrimSpace(e.Text)
class := e.Attr("class")
//if class != "" && text != "" {
classesLinks[class] = append(classesLinks[class], map[string]string{
"href": href,
"text": text,
})
classesCount[class]++
if classesCount[class] > classesCount[classMax] {
classMax = class
} }
//}
})
} else {
c.OnHTML(selector, func(e *colly.HTMLElement) { c.OnHTML(selector, func(e *colly.HTMLElement) {
href := e.Attr("href") href := e.Attr("href")
text := strings.TrimSpace(e.Text) text := strings.TrimSpace(e.Text)
class := e.Attr("class") class := e.Attr("class")
//if class != "" && text != "" { if selectorSet || class != "" && text != "" {
classesLinks[class] = append(classesLinks[class], map[string]string{ classesLinks[class] = append(classesLinks[class], map[string]string{
"href": href, "href": href,
"text": text, "text": text,
@@ -88,12 +74,16 @@ func tableOfContent(url, selector string, delay int) []chapter {
if classesCount[class] > classesCount[classMax] { if classesCount[class] > classesCount[classMax] {
classMax = class classMax = class
} }
//}
})
} }
})
c.Visit(url) c.Visit(url)
fmt.Println(classesCount)
links := classesLinks[classMax] links := classesLinks[classMax]
if limit != -1 {
limit = int(math.Min(float64(limit), float64(len(links))))
links = links[:limit]
}
chapters := make([]chapter, len(links)) chapters := make([]chapter, len(links))
base, err := urllib.Parse(url) base, err := urllib.Parse(url)

View File

@@ -17,7 +17,7 @@ import (
var quiet, stdout, recursive, include bool var quiet, stdout, recursive, include bool
var format, output, selector string var format, output, selector string
var delay int var limit, delay int
var getCmd = &cobra.Command{ var getCmd = &cobra.Command{
Use: "get", Use: "get",
@@ -48,10 +48,22 @@ var getCmd = &cobra.Command{
} }
} }
if include && recursive == false { if cmd.Flags().Changed("selector") && recursive == false {
return errors.New("cannot use selector option if not in recursive mode")
}
if cmd.Flags().Changed("include") && recursive == false {
return errors.New("cannot use include option if not in recursive mode") return errors.New("cannot use include option if not in recursive mode")
} }
if cmd.Flags().Changed("limit") && recursive == false {
return errors.New("cannot use limit option if not in recursive mode")
}
if cmd.Flags().Changed("delay") && recursive == false {
return errors.New("cannot use delay option if not in recursive mode")
}
return nil return nil
}, },
Run: func(cmd *cobra.Command, args []string) { Run: func(cmd *cobra.Command, args []string) {
@@ -59,7 +71,7 @@ var getCmd = &cobra.Command{
var b book.Book var b book.Book
if recursive { if recursive {
b = book.NewBookFromURL(url, selector, include, delay) b = book.NewBookFromURL(url, selector, include, limit, delay)
} else { } else {
c := book.NewChapterFromURL(url) c := book.NewChapterFromURL(url)
b = book.New(c.Name(), c.Author()) b = book.New(c.Name(), c.Author())
@@ -104,8 +116,10 @@ var getCmd = &cobra.Command{
} }
} }
if stdout == false {
fmt.Printf("Markdown saved to \"%s\"\n", output) fmt.Printf("Markdown saved to \"%s\"\n", output)
} }
}
if format == "epub" { if format == "epub" {
e := epub.NewEpub(b.Name()) e := epub.NewEpub(b.Name())

View File

@@ -3,6 +3,8 @@ package cmd
import ( import (
"errors" "errors"
"fmt" "fmt"
"log"
urllib "net/url"
"strings" "strings"
colly "github.com/gocolly/colly/v2" colly "github.com/gocolly/colly/v2"
@@ -19,20 +21,27 @@ var listCmd = &cobra.Command{
return nil return nil
}, },
Run: func(cmd *cobra.Command, args []string) { Run: func(cmd *cobra.Command, args []string) {
url := args[0] base, err := urllib.Parse(args[0])
c := colly.NewCollector() if err != nil {
log.Fatal(err)
}
if selector == "" {
selector = "a"
}
// visit and count link classes // visit and count link classes
classesLinks := map[string][]map[string]string{} classesLinks := map[string][]map[string]string{}
classesCount := map[string]int{} classesCount := map[string]int{}
classMax := "" classMax := ""
c := colly.NewCollector()
c.OnHTML(selector, func(e *colly.HTMLElement) { c.OnHTML(selector, func(e *colly.HTMLElement) {
href := e.Attr("href") href := e.Attr("href")
text := strings.TrimSpace(e.Text) text := strings.TrimSpace(e.Text)
class := e.Attr("class") class := e.Attr("class")
// if class != "" && text != "" { if cmd.Flags().Changed("selector") || class != "" && text != "" {
classesLinks[class] = append(classesLinks[class], map[string]string{ classesLinks[class] = append(classesLinks[class], map[string]string{
"href": href, "href": href,
"text": text, "text": text,
@@ -43,12 +52,17 @@ var listCmd = &cobra.Command{
if classesCount[class] > classesCount[classMax] { if classesCount[class] > classesCount[classMax] {
classMax = class classMax = class
} }
// } }
}) })
c.Visit(url) c.Visit(base.String())
for index, link := range classesLinks[classMax] { for index, link := range classesLinks[classMax] {
fmt.Printf("Chapter %d: %s %s\n", index+1, link["text"], link["href"]) u, err := base.Parse(link["href"])
if err != nil {
log.Fatal(err)
}
fmt.Printf("Chapter %d: %s %s\n", index+1, link["text"], u.String())
} }
}, },

View File

@@ -28,9 +28,10 @@ func init() {
rootCmd.PersistentFlags().StringVarP(&selector, "selector", "s", "", "table of content CSS selector") rootCmd.PersistentFlags().StringVarP(&selector, "selector", "s", "", "table of content CSS selector")
rootCmd.PersistentFlags().BoolVarP(&recursive, "recursive", "r", false, "create one chapter per natigation item") rootCmd.PersistentFlags().BoolVarP(&recursive, "recursive", "r", false, "create one chapter per natigation item")
rootCmd.PersistentFlags().BoolVarP(&include, "include", "i", false, "include URL as first chapter, in resursive mode") rootCmd.PersistentFlags().BoolVarP(&include, "include", "i", false, "include URL as first chapter, in resursive mode")
rootCmd.PersistentFlags().BoolVarP(&quiet, "quiet", "q", false, "do not show progress bars") rootCmd.PersistentFlags().BoolVarP(&quiet, "quiet", "q", false, "do not show logs")
rootCmd.PersistentFlags().BoolVarP(&stdout, "stdout", "", false, "print to standard output") rootCmd.PersistentFlags().BoolVarP(&stdout, "stdout", "", false, "print to standard output")
rootCmd.PersistentFlags().IntVarP(&delay, "delay", "d", -1, "wait before downloading next chapter, in milliseconds") rootCmd.PersistentFlags().IntVarP(&limit, "limit", "l", -1, "limit number of chapters, in recursive mode")
rootCmd.PersistentFlags().IntVarP(&delay, "delay", "d", -1, "time to wait before downloading next chapter, in milliseconds")
rootCmd.AddCommand(getCmd) rootCmd.AddCommand(getCmd)
rootCmd.AddCommand(listCmd) rootCmd.AddCommand(listCmd)

View File

@@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{
Use: "version", Use: "version",
Short: "Print the version number of papeer", Short: "Print the version number of papeer",
Run: func(cmd *cobra.Command, args []string) { Run: func(cmd *cobra.Command, args []string) {
fmt.Println("papeer v0.0.1") fmt.Println("papeer v0.0.2")
}, },
} }