diff --git a/README.md b/README.md index e428a94..22ad238 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,19 @@ +``` +❯ papeer get --format epub --recursive --delay 500 --limit 10 https://news.ycombinator.com/ + 6s [===============================================>--------------------] 70% Status: 7 out of 10 chapters + 0s [====================================================================] 100% 1. Three ex-US intelligence officers admit hacking for UAE + 0s [====================================================================] 100% 2. Show HN: Time Travel Debugger + 0s [====================================================================] 100% 3. How much faster is Java 17? + 0s [====================================================================] 100% 4. The First Webcam Was Invented to Keep an Eye on a Coffee Pot + 0s [====================================================================] 100% 5. Nikon's 2021 Photomicrography Competition Winners + 0s [====================================================================] 100% 6. HTTP Status 418 – I'm a teapot + 0s [====================================================================] 100% 7. H3: Hexagonal hierarchical geospatial indexing system + --- [--------------------------------------------------------------------] 0% 8. Automatic cipher suite ordering in Go’s crypto/tls + --- [--------------------------------------------------------------------] 0% 9. Find engineering roles at over 800 YC-funded startups + --- [--------------------------------------------------------------------] 0% 10. Futarchy: Robin Hanson on prediction markets +Ebook saved to "Hacker_News.epub" +``` + # Installation ## From binary @@ -5,18 +21,25 @@ ```sh curl https://github.com/lapwat/papeer/releases/download/v0.0.1/papeer-v0.0.1 > papeer chmod +x papeer -mv papeer /usr/local/bin +sudo mv papeer /usr/local/bin +``` + +```sh +# (optional) install kindlegen to export ebooks to MOBI format +curl https://github.com/lapwat/papeer/raw/master/bin/kindlegen_linux_2.6_i386_v2_9.tar.gz > kindlegen +chmod +x kindlegen +sudo mv kindlegen /usr/local/bin ``` ## From source ```sh -go install github.com/lapwat/papeer +go get -u github.com/lapwat/papeer ``` # Usage -```txt +``` Browse the web in the eink era Usage: diff --git a/book/scraper.go b/book/scraper.go index 872f098..b43a8e4 100644 --- a/book/scraper.go +++ b/book/scraper.go @@ -3,6 +3,7 @@ package book import ( "fmt" "log" + "math" urllib "net/url" "strings" "sync" @@ -17,11 +18,11 @@ type scraper struct { url string } -func NewBookFromURL(url, selector string, include bool, delay int) Book { +func NewBookFromURL(url, selector string, include bool, limit, delay int) Book { home := NewChapterFromURL(url) b := New(home.Name(), home.Author()) - chapters := tableOfContent(url, selector, delay) + chapters := tableOfContent(url, selector, limit, delay) if include { b.AddChapter(home) } @@ -44,56 +45,45 @@ func NewChapterFromURL(url string) chapter { return chapter{article.Title, article.Byline, article.Content} } -func tableOfContent(url, selector string, delay int) []chapter { +func tableOfContent(url, selector string, limit, delay int) []chapter { c := colly.NewCollector() classesLinks := map[string][]map[string]string{} classesCount := map[string]int{} classMax := "" + selectorSet := true if selector == "" { - c.OnHTML("a", func(e *colly.HTMLElement) { - href := e.Attr("href") - text := strings.TrimSpace(e.Text) - class := e.Attr("class") - - //if class != "" && text != "" { - classesLinks[class] = append(classesLinks[class], map[string]string{ - "href": href, - "text": text, - }) - - classesCount[class]++ - - if classesCount[class] > classesCount[classMax] { - classMax = class - } - //} - - }) - } else { - c.OnHTML(selector, func(e *colly.HTMLElement) { - href := e.Attr("href") - text := strings.TrimSpace(e.Text) - class := e.Attr("class") - - //if class != "" && text != "" { - classesLinks[class] = append(classesLinks[class], map[string]string{ - "href": href, - "text": text, - }) - - classesCount[class]++ - - if classesCount[class] > classesCount[classMax] { - classMax = class - } - //} - }) + selector = "a" + selectorSet = false } + + c.OnHTML(selector, func(e *colly.HTMLElement) { + href := e.Attr("href") + text := strings.TrimSpace(e.Text) + class := e.Attr("class") + + if selectorSet || class != "" && text != "" { + classesLinks[class] = append(classesLinks[class], map[string]string{ + "href": href, + "text": text, + }) + + classesCount[class]++ + + if classesCount[class] > classesCount[classMax] { + classMax = class + } + } + + }) c.Visit(url) - fmt.Println(classesCount) + links := classesLinks[classMax] + if limit != -1 { + limit = int(math.Min(float64(limit), float64(len(links)))) + links = links[:limit] + } chapters := make([]chapter, len(links)) base, err := urllib.Parse(url) diff --git a/cmd/get.go b/cmd/get.go index 2c19140..039c9d9 100644 --- a/cmd/get.go +++ b/cmd/get.go @@ -17,7 +17,7 @@ import ( var quiet, stdout, recursive, include bool var format, output, selector string -var delay int +var limit, delay int var getCmd = &cobra.Command{ Use: "get", @@ -48,10 +48,22 @@ var getCmd = &cobra.Command{ } } - if include && recursive == false { + if cmd.Flags().Changed("selector") && recursive == false { + return errors.New("cannot use selector option if not in recursive mode") + } + + if cmd.Flags().Changed("include") && recursive == false { return errors.New("cannot use include option if not in recursive mode") } + if cmd.Flags().Changed("limit") && recursive == false { + return errors.New("cannot use limit option if not in recursive mode") + } + + if cmd.Flags().Changed("delay") && recursive == false { + return errors.New("cannot use delay option if not in recursive mode") + } + return nil }, Run: func(cmd *cobra.Command, args []string) { @@ -59,7 +71,7 @@ var getCmd = &cobra.Command{ var b book.Book if recursive { - b = book.NewBookFromURL(url, selector, include, delay) + b = book.NewBookFromURL(url, selector, include, limit, delay) } else { c := book.NewChapterFromURL(url) b = book.New(c.Name(), c.Author()) @@ -104,7 +116,9 @@ var getCmd = &cobra.Command{ } } - fmt.Printf("Markdown saved to \"%s\"\n", output) + if stdout == false { + fmt.Printf("Markdown saved to \"%s\"\n", output) + } } if format == "epub" { diff --git a/cmd/list.go b/cmd/list.go index 71082ea..bbe98a2 100644 --- a/cmd/list.go +++ b/cmd/list.go @@ -3,6 +3,8 @@ package cmd import ( "errors" "fmt" + "log" + urllib "net/url" "strings" colly "github.com/gocolly/colly/v2" @@ -19,20 +21,27 @@ var listCmd = &cobra.Command{ return nil }, Run: func(cmd *cobra.Command, args []string) { - url := args[0] - c := colly.NewCollector() + base, err := urllib.Parse(args[0]) + if err != nil { + log.Fatal(err) + } + + if selector == "" { + selector = "a" + } // visit and count link classes classesLinks := map[string][]map[string]string{} classesCount := map[string]int{} classMax := "" + c := colly.NewCollector() c.OnHTML(selector, func(e *colly.HTMLElement) { href := e.Attr("href") text := strings.TrimSpace(e.Text) class := e.Attr("class") - // if class != "" && text != "" { + if cmd.Flags().Changed("selector") || class != "" && text != "" { classesLinks[class] = append(classesLinks[class], map[string]string{ "href": href, "text": text, @@ -43,12 +52,17 @@ var listCmd = &cobra.Command{ if classesCount[class] > classesCount[classMax] { classMax = class } - // } + } }) - c.Visit(url) + c.Visit(base.String()) for index, link := range classesLinks[classMax] { - fmt.Printf("Chapter %d: %s %s\n", index+1, link["text"], link["href"]) + u, err := base.Parse(link["href"]) + if err != nil { + log.Fatal(err) + } + + fmt.Printf("Chapter %d: %s %s\n", index+1, link["text"], u.String()) } }, diff --git a/cmd/root.go b/cmd/root.go index 97aa552..8fc1392 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -28,9 +28,10 @@ func init() { rootCmd.PersistentFlags().StringVarP(&selector, "selector", "s", "", "table of content CSS selector") rootCmd.PersistentFlags().BoolVarP(&recursive, "recursive", "r", false, "create one chapter per natigation item") rootCmd.PersistentFlags().BoolVarP(&include, "include", "i", false, "include URL as first chapter, in resursive mode") - rootCmd.PersistentFlags().BoolVarP(&quiet, "quiet", "q", false, "do not show progress bars") + rootCmd.PersistentFlags().BoolVarP(&quiet, "quiet", "q", false, "do not show logs") rootCmd.PersistentFlags().BoolVarP(&stdout, "stdout", "", false, "print to standard output") - rootCmd.PersistentFlags().IntVarP(&delay, "delay", "d", -1, "wait before downloading next chapter, in milliseconds") + rootCmd.PersistentFlags().IntVarP(&limit, "limit", "l", -1, "limit number of chapters, in recursive mode") + rootCmd.PersistentFlags().IntVarP(&delay, "delay", "d", -1, "time to wait before downloading next chapter, in milliseconds") rootCmd.AddCommand(getCmd) rootCmd.AddCommand(listCmd)