5 Commits

Author SHA1 Message Date
lapwat
a7d14e7372 bump version 2021-09-15 20:26:50 +02:00
lapwat
ac1fb3dd51 limit option, docs 2021-09-15 20:25:23 +02:00
lapwat
bb1df3b6a3 update readme 2021-09-14 23:28:22 +02:00
lapwat
83adc22e2c update readme 2021-09-14 23:27:26 +02:00
lapwat
73441aa7fb update readme 2021-09-14 23:18:32 +02:00
6 changed files with 123 additions and 58 deletions

View File

@@ -1,12 +1,45 @@
```
papeer get --format epub --recursive --delay 500 --limit 10 https://news.ycombinator.com/
6s [===============================================>--------------------] 70% Status: 7 out of 10 chapters
0s [====================================================================] 100% 1. Three ex-US intelligence officers admit hacking for UAE
0s [====================================================================] 100% 2. Show HN: Time Travel Debugger
0s [====================================================================] 100% 3. How much faster is Java 17?
0s [====================================================================] 100% 4. The First Webcam Was Invented to Keep an Eye on a Coffee Pot
0s [====================================================================] 100% 5. Nikon's 2021 Photomicrography Competition Winners
0s [====================================================================] 100% 6. HTTP Status 418 I'm a teapot
0s [====================================================================] 100% 7. H3: Hexagonal hierarchical geospatial indexing system
--- [--------------------------------------------------------------------] 0% 8. Automatic cipher suite ordering in Go's crypto/tls
--- [--------------------------------------------------------------------] 0% 9. Find engineering roles at over 800 YC-funded startups
--- [--------------------------------------------------------------------] 0% 10. Futarchy: Robin Hanson on prediction markets
Ebook saved to "Hacker_News.epub"
```
# Installation
## From binary
```sh
go install github.com/lapwat/papeer
curl https://github.com/lapwat/papeer/releases/download/v0.0.1/papeer-v0.0.1 > papeer
chmod +x papeer
sudo mv papeer /usr/local/bin
```
```sh
# (optional) install kindlegen to export ebooks to MOBI format
curl https://github.com/lapwat/papeer/raw/master/bin/kindlegen_linux_2.6_i386_v2_9.tar.gz > kindlegen
chmod +x kindlegen
sudo mv kindlegen /usr/local/bin
```
## From source
```sh
go get -u github.com/lapwat/papeer
```
# Usage
```sh
```
Browse the web in the eink era
Usage:
@@ -39,5 +72,18 @@ Use "papeer [command] --help" for more information about a command.
Execute this command in your current shell, or add it to your `.bashrc`.
```sh
. <(papeer completion [bash|fish|powershell|zsh])
. <(papeer completion bash)
```
Type `papeer completion bash -h` for more information.
You can replace `bash` with your own shell (zsh, fish, or powershell).
# Dependencies
- `cobra` command line interface
- `go-readability` extract content from HTML
- `html-to-markdown` convert HTML to Markdown
- `go-epub` convert HTML to EPUB
- `colly` query HTML trees
- `uiprogress` display progress bars

View File

@@ -3,6 +3,7 @@ package book
import (
"fmt"
"log"
"math"
urllib "net/url"
"strings"
"sync"
@@ -17,11 +18,11 @@ type scraper struct {
url string
}
func NewBookFromURL(url, selector string, include bool, delay int) Book {
func NewBookFromURL(url, selector string, include bool, limit, delay int) Book {
home := NewChapterFromURL(url)
b := New(home.Name(), home.Author())
chapters := tableOfContent(url, selector, delay)
chapters := tableOfContent(url, selector, limit, delay)
if include {
b.AddChapter(home)
}
@@ -44,56 +45,45 @@ func NewChapterFromURL(url string) chapter {
return chapter{article.Title, article.Byline, article.Content}
}
func tableOfContent(url, selector string, delay int) []chapter {
func tableOfContent(url, selector string, limit, delay int) []chapter {
c := colly.NewCollector()
classesLinks := map[string][]map[string]string{}
classesCount := map[string]int{}
classMax := ""
selectorSet := true
if selector == "" {
c.OnHTML("a", func(e *colly.HTMLElement) {
href := e.Attr("href")
text := strings.TrimSpace(e.Text)
class := e.Attr("class")
//if class != "" && text != "" {
classesLinks[class] = append(classesLinks[class], map[string]string{
"href": href,
"text": text,
})
classesCount[class]++
if classesCount[class] > classesCount[classMax] {
classMax = class
}
//}
})
} else {
c.OnHTML(selector, func(e *colly.HTMLElement) {
href := e.Attr("href")
text := strings.TrimSpace(e.Text)
class := e.Attr("class")
//if class != "" && text != "" {
classesLinks[class] = append(classesLinks[class], map[string]string{
"href": href,
"text": text,
})
classesCount[class]++
if classesCount[class] > classesCount[classMax] {
classMax = class
}
//}
})
selector = "a"
selectorSet = false
}
c.OnHTML(selector, func(e *colly.HTMLElement) {
href := e.Attr("href")
text := strings.TrimSpace(e.Text)
class := e.Attr("class")
if selectorSet || class != "" && text != "" {
classesLinks[class] = append(classesLinks[class], map[string]string{
"href": href,
"text": text,
})
classesCount[class]++
if classesCount[class] > classesCount[classMax] {
classMax = class
}
}
})
c.Visit(url)
fmt.Println(classesCount)
links := classesLinks[classMax]
if limit != -1 {
limit = int(math.Min(float64(limit), float64(len(links))))
links = links[:limit]
}
chapters := make([]chapter, len(links))
base, err := urllib.Parse(url)

View File

@@ -17,7 +17,7 @@ import (
var quiet, stdout, recursive, include bool
var format, output, selector string
var delay int
var limit, delay int
var getCmd = &cobra.Command{
Use: "get",
@@ -48,10 +48,22 @@ var getCmd = &cobra.Command{
}
}
if include && recursive == false {
if cmd.Flags().Changed("selector") && recursive == false {
return errors.New("cannot use selector option if not in recursive mode")
}
if cmd.Flags().Changed("include") && recursive == false {
return errors.New("cannot use include option if not in recursive mode")
}
if cmd.Flags().Changed("limit") && recursive == false {
return errors.New("cannot use limit option if not in recursive mode")
}
if cmd.Flags().Changed("delay") && recursive == false {
return errors.New("cannot use delay option if not in recursive mode")
}
return nil
},
Run: func(cmd *cobra.Command, args []string) {
@@ -59,7 +71,7 @@ var getCmd = &cobra.Command{
var b book.Book
if recursive {
b = book.NewBookFromURL(url, selector, include, delay)
b = book.NewBookFromURL(url, selector, include, limit, delay)
} else {
c := book.NewChapterFromURL(url)
b = book.New(c.Name(), c.Author())
@@ -104,7 +116,9 @@ var getCmd = &cobra.Command{
}
}
fmt.Printf("Markdown saved to \"%s\"\n", output)
if stdout == false {
fmt.Printf("Markdown saved to \"%s\"\n", output)
}
}
if format == "epub" {

View File

@@ -3,6 +3,8 @@ package cmd
import (
"errors"
"fmt"
"log"
urllib "net/url"
"strings"
colly "github.com/gocolly/colly/v2"
@@ -19,20 +21,27 @@ var listCmd = &cobra.Command{
return nil
},
Run: func(cmd *cobra.Command, args []string) {
url := args[0]
c := colly.NewCollector()
base, err := urllib.Parse(args[0])
if err != nil {
log.Fatal(err)
}
if selector == "" {
selector = "a"
}
// visit and count link classes
classesLinks := map[string][]map[string]string{}
classesCount := map[string]int{}
classMax := ""
c := colly.NewCollector()
c.OnHTML(selector, func(e *colly.HTMLElement) {
href := e.Attr("href")
text := strings.TrimSpace(e.Text)
class := e.Attr("class")
// if class != "" && text != "" {
if cmd.Flags().Changed("selector") || class != "" && text != "" {
classesLinks[class] = append(classesLinks[class], map[string]string{
"href": href,
"text": text,
@@ -43,12 +52,17 @@ var listCmd = &cobra.Command{
if classesCount[class] > classesCount[classMax] {
classMax = class
}
// }
}
})
c.Visit(url)
c.Visit(base.String())
for index, link := range classesLinks[classMax] {
fmt.Printf("Chapter %d: %s %s\n", index+1, link["text"], link["href"])
u, err := base.Parse(link["href"])
if err != nil {
log.Fatal(err)
}
fmt.Printf("Chapter %d: %s %s\n", index+1, link["text"], u.String())
}
},

View File

@@ -28,9 +28,10 @@ func init() {
rootCmd.PersistentFlags().StringVarP(&selector, "selector", "s", "", "table of content CSS selector")
rootCmd.PersistentFlags().BoolVarP(&recursive, "recursive", "r", false, "create one chapter per natigation item")
rootCmd.PersistentFlags().BoolVarP(&include, "include", "i", false, "include URL as first chapter, in resursive mode")
rootCmd.PersistentFlags().BoolVarP(&quiet, "quiet", "q", false, "do not show progress bars")
rootCmd.PersistentFlags().BoolVarP(&quiet, "quiet", "q", false, "do not show logs")
rootCmd.PersistentFlags().BoolVarP(&stdout, "stdout", "", false, "print to standard output")
rootCmd.PersistentFlags().IntVarP(&delay, "delay", "d", -1, "wait before downloading next chapter, in milliseconds")
rootCmd.PersistentFlags().IntVarP(&limit, "limit", "l", -1, "limit number of chapters, in recursive mode")
rootCmd.PersistentFlags().IntVarP(&delay, "delay", "d", -1, "time to wait before downloading next chapter, in milliseconds")
rootCmd.AddCommand(getCmd)
rootCmd.AddCommand(listCmd)

View File

@@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{
Use: "version",
Short: "Print the version number of papeer",
Run: func(cmd *cobra.Command, args []string) {
fmt.Println("papeer v0.0.1")
fmt.Println("papeer v0.0.2")
},
}