limit option, docs

This commit is contained in:
lapwat
2021-09-15 20:25:23 +02:00
parent bb1df3b6a3
commit ac1fb3dd51
5 changed files with 99 additions and 57 deletions

View File

@@ -1,3 +1,19 @@
```
papeer get --format epub --recursive --delay 500 --limit 10 https://news.ycombinator.com/
6s [===============================================>--------------------] 70% Status: 7 out of 10 chapters
0s [====================================================================] 100% 1. Three ex-US intelligence officers admit hacking for UAE
0s [====================================================================] 100% 2. Show HN: Time Travel Debugger
0s [====================================================================] 100% 3. How much faster is Java 17?
0s [====================================================================] 100% 4. The First Webcam Was Invented to Keep an Eye on a Coffee Pot
0s [====================================================================] 100% 5. Nikon's 2021 Photomicrography Competition Winners
0s [====================================================================] 100% 6. HTTP Status 418 I'm a teapot
0s [====================================================================] 100% 7. H3: Hexagonal hierarchical geospatial indexing system
--- [--------------------------------------------------------------------] 0% 8. Automatic cipher suite ordering in Go's crypto/tls
--- [--------------------------------------------------------------------] 0% 9. Find engineering roles at over 800 YC-funded startups
--- [--------------------------------------------------------------------] 0% 10. Futarchy: Robin Hanson on prediction markets
Ebook saved to "Hacker_News.epub"
```
# Installation # Installation
## From binary ## From binary
@@ -5,18 +21,25 @@
```sh ```sh
curl https://github.com/lapwat/papeer/releases/download/v0.0.1/papeer-v0.0.1 > papeer curl https://github.com/lapwat/papeer/releases/download/v0.0.1/papeer-v0.0.1 > papeer
chmod +x papeer chmod +x papeer
mv papeer /usr/local/bin sudo mv papeer /usr/local/bin
```
```sh
# (optional) install kindlegen to export ebooks to MOBI format
curl https://github.com/lapwat/papeer/raw/master/bin/kindlegen_linux_2.6_i386_v2_9.tar.gz > kindlegen
chmod +x kindlegen
sudo mv kindlegen /usr/local/bin
``` ```
## From source ## From source
```sh ```sh
go install github.com/lapwat/papeer go get -u github.com/lapwat/papeer
``` ```
# Usage # Usage
```txt ```
Browse the web in the eink era Browse the web in the eink era
Usage: Usage:

View File

@@ -3,6 +3,7 @@ package book
import ( import (
"fmt" "fmt"
"log" "log"
"math"
urllib "net/url" urllib "net/url"
"strings" "strings"
"sync" "sync"
@@ -17,11 +18,11 @@ type scraper struct {
url string url string
} }
func NewBookFromURL(url, selector string, include bool, delay int) Book { func NewBookFromURL(url, selector string, include bool, limit, delay int) Book {
home := NewChapterFromURL(url) home := NewChapterFromURL(url)
b := New(home.Name(), home.Author()) b := New(home.Name(), home.Author())
chapters := tableOfContent(url, selector, delay) chapters := tableOfContent(url, selector, limit, delay)
if include { if include {
b.AddChapter(home) b.AddChapter(home)
} }
@@ -44,56 +45,45 @@ func NewChapterFromURL(url string) chapter {
return chapter{article.Title, article.Byline, article.Content} return chapter{article.Title, article.Byline, article.Content}
} }
func tableOfContent(url, selector string, delay int) []chapter { func tableOfContent(url, selector string, limit, delay int) []chapter {
c := colly.NewCollector() c := colly.NewCollector()
classesLinks := map[string][]map[string]string{} classesLinks := map[string][]map[string]string{}
classesCount := map[string]int{} classesCount := map[string]int{}
classMax := "" classMax := ""
selectorSet := true
if selector == "" { if selector == "" {
c.OnHTML("a", func(e *colly.HTMLElement) { selector = "a"
href := e.Attr("href") selectorSet = false
text := strings.TrimSpace(e.Text)
class := e.Attr("class")
//if class != "" && text != "" {
classesLinks[class] = append(classesLinks[class], map[string]string{
"href": href,
"text": text,
})
classesCount[class]++
if classesCount[class] > classesCount[classMax] {
classMax = class
}
//}
})
} else {
c.OnHTML(selector, func(e *colly.HTMLElement) {
href := e.Attr("href")
text := strings.TrimSpace(e.Text)
class := e.Attr("class")
//if class != "" && text != "" {
classesLinks[class] = append(classesLinks[class], map[string]string{
"href": href,
"text": text,
})
classesCount[class]++
if classesCount[class] > classesCount[classMax] {
classMax = class
}
//}
})
} }
c.OnHTML(selector, func(e *colly.HTMLElement) {
href := e.Attr("href")
text := strings.TrimSpace(e.Text)
class := e.Attr("class")
if selectorSet || class != "" && text != "" {
classesLinks[class] = append(classesLinks[class], map[string]string{
"href": href,
"text": text,
})
classesCount[class]++
if classesCount[class] > classesCount[classMax] {
classMax = class
}
}
})
c.Visit(url) c.Visit(url)
fmt.Println(classesCount)
links := classesLinks[classMax] links := classesLinks[classMax]
if limit != -1 {
limit = int(math.Min(float64(limit), float64(len(links))))
links = links[:limit]
}
chapters := make([]chapter, len(links)) chapters := make([]chapter, len(links))
base, err := urllib.Parse(url) base, err := urllib.Parse(url)

View File

@@ -17,7 +17,7 @@ import (
var quiet, stdout, recursive, include bool var quiet, stdout, recursive, include bool
var format, output, selector string var format, output, selector string
var delay int var limit, delay int
var getCmd = &cobra.Command{ var getCmd = &cobra.Command{
Use: "get", Use: "get",
@@ -48,10 +48,22 @@ var getCmd = &cobra.Command{
} }
} }
if include && recursive == false { if cmd.Flags().Changed("selector") && recursive == false {
return errors.New("cannot use selector option if not in recursive mode")
}
if cmd.Flags().Changed("include") && recursive == false {
return errors.New("cannot use include option if not in recursive mode") return errors.New("cannot use include option if not in recursive mode")
} }
if cmd.Flags().Changed("limit") && recursive == false {
return errors.New("cannot use limit option if not in recursive mode")
}
if cmd.Flags().Changed("delay") && recursive == false {
return errors.New("cannot use delay option if not in recursive mode")
}
return nil return nil
}, },
Run: func(cmd *cobra.Command, args []string) { Run: func(cmd *cobra.Command, args []string) {
@@ -59,7 +71,7 @@ var getCmd = &cobra.Command{
var b book.Book var b book.Book
if recursive { if recursive {
b = book.NewBookFromURL(url, selector, include, delay) b = book.NewBookFromURL(url, selector, include, limit, delay)
} else { } else {
c := book.NewChapterFromURL(url) c := book.NewChapterFromURL(url)
b = book.New(c.Name(), c.Author()) b = book.New(c.Name(), c.Author())
@@ -104,7 +116,9 @@ var getCmd = &cobra.Command{
} }
} }
fmt.Printf("Markdown saved to \"%s\"\n", output) if stdout == false {
fmt.Printf("Markdown saved to \"%s\"\n", output)
}
} }
if format == "epub" { if format == "epub" {

View File

@@ -3,6 +3,8 @@ package cmd
import ( import (
"errors" "errors"
"fmt" "fmt"
"log"
urllib "net/url"
"strings" "strings"
colly "github.com/gocolly/colly/v2" colly "github.com/gocolly/colly/v2"
@@ -19,20 +21,27 @@ var listCmd = &cobra.Command{
return nil return nil
}, },
Run: func(cmd *cobra.Command, args []string) { Run: func(cmd *cobra.Command, args []string) {
url := args[0] base, err := urllib.Parse(args[0])
c := colly.NewCollector() if err != nil {
log.Fatal(err)
}
if selector == "" {
selector = "a"
}
// visit and count link classes // visit and count link classes
classesLinks := map[string][]map[string]string{} classesLinks := map[string][]map[string]string{}
classesCount := map[string]int{} classesCount := map[string]int{}
classMax := "" classMax := ""
c := colly.NewCollector()
c.OnHTML(selector, func(e *colly.HTMLElement) { c.OnHTML(selector, func(e *colly.HTMLElement) {
href := e.Attr("href") href := e.Attr("href")
text := strings.TrimSpace(e.Text) text := strings.TrimSpace(e.Text)
class := e.Attr("class") class := e.Attr("class")
// if class != "" && text != "" { if cmd.Flags().Changed("selector") || class != "" && text != "" {
classesLinks[class] = append(classesLinks[class], map[string]string{ classesLinks[class] = append(classesLinks[class], map[string]string{
"href": href, "href": href,
"text": text, "text": text,
@@ -43,12 +52,17 @@ var listCmd = &cobra.Command{
if classesCount[class] > classesCount[classMax] { if classesCount[class] > classesCount[classMax] {
classMax = class classMax = class
} }
// } }
}) })
c.Visit(url) c.Visit(base.String())
for index, link := range classesLinks[classMax] { for index, link := range classesLinks[classMax] {
fmt.Printf("Chapter %d: %s %s\n", index+1, link["text"], link["href"]) u, err := base.Parse(link["href"])
if err != nil {
log.Fatal(err)
}
fmt.Printf("Chapter %d: %s %s\n", index+1, link["text"], u.String())
} }
}, },

View File

@@ -28,9 +28,10 @@ func init() {
rootCmd.PersistentFlags().StringVarP(&selector, "selector", "s", "", "table of content CSS selector") rootCmd.PersistentFlags().StringVarP(&selector, "selector", "s", "", "table of content CSS selector")
rootCmd.PersistentFlags().BoolVarP(&recursive, "recursive", "r", false, "create one chapter per natigation item") rootCmd.PersistentFlags().BoolVarP(&recursive, "recursive", "r", false, "create one chapter per natigation item")
rootCmd.PersistentFlags().BoolVarP(&include, "include", "i", false, "include URL as first chapter, in resursive mode") rootCmd.PersistentFlags().BoolVarP(&include, "include", "i", false, "include URL as first chapter, in resursive mode")
rootCmd.PersistentFlags().BoolVarP(&quiet, "quiet", "q", false, "do not show progress bars") rootCmd.PersistentFlags().BoolVarP(&quiet, "quiet", "q", false, "do not show logs")
rootCmd.PersistentFlags().BoolVarP(&stdout, "stdout", "", false, "print to standard output") rootCmd.PersistentFlags().BoolVarP(&stdout, "stdout", "", false, "print to standard output")
rootCmd.PersistentFlags().IntVarP(&delay, "delay", "d", -1, "wait before downloading next chapter, in milliseconds") rootCmd.PersistentFlags().IntVarP(&limit, "limit", "l", -1, "limit number of chapters, in recursive mode")
rootCmd.PersistentFlags().IntVarP(&delay, "delay", "d", -1, "time to wait before downloading next chapter, in milliseconds")
rootCmd.AddCommand(getCmd) rootCmd.AddCommand(getCmd)
rootCmd.AddCommand(listCmd) rootCmd.AddCommand(listCmd)