diff --git a/README.md b/README.md index 4825ce4..734998c 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,16 @@ ``` -❯ papeer get --format epub --recursive --delay 500 --limit 10 https://news.ycombinator.com/ - 6s [===============================================>--------------------] 70% Status: 7 out of 10 chapters - 0s [====================================================================] 100% 1. Three ex-US intelligence officers admit hacking for UAE - 0s [====================================================================] 100% 2. Show HN: Time Travel Debugger - 0s [====================================================================] 100% 3. How much faster is Java 17? - 0s [====================================================================] 100% 4. The First Webcam Was Invented to Keep an Eye on a Coffee Pot - 0s [====================================================================] 100% 5. Nikon's 2021 Photomicrography Competition Winners - 0s [====================================================================] 100% 6. HTTP Status 418 – I'm a teapot - 0s [====================================================================] 100% 7. H3: Hexagonal hierarchical geospatial indexing system - --- [--------------------------------------------------------------------] 0% 8. Automatic cipher suite ordering in Go’s crypto/tls - --- [--------------------------------------------------------------------] 0% 9. Find engineering roles at over 800 YC-funded startups - --- [--------------------------------------------------------------------] 0% 10. Futarchy: Robin Hanson on prediction markets +❯ papeer get --format=epub --recursive --delay=500 --limit=10 https://news.ycombinator.com/ +[===============================================>--------------------] Chapters 7 / 10 +[====================================================================] 1. 
Three ex-US intelligence officers admit hacking for UAE +[====================================================================] 2. Show HN: Time Travel Debugger +[====================================================================] 3. How much faster is Java 17? +[====================================================================] 4. The First Webcam Was Invented to Keep an Eye on a Coffee Pot +[====================================================================] 5. Nikon's 2021 Photomicrography Competition Winners +[====================================================================] 6. HTTP Status 418 – I'm a teapot +[====================================================================] 7. H3: Hexagonal hierarchical geospatial indexing system +[--------------------------------------------------------------------] 8. Automatic cipher suite ordering in Go’s crypto/tls +[--------------------------------------------------------------------] 9. Find engineering roles at over 800 YC-funded startups +[--------------------------------------------------------------------] 10. Futarchy: Robin Hanson on prediction markets Ebook saved to "Hacker_News.epub" ``` @@ -29,14 +29,14 @@ go get -u github.com/lapwat/papeer ```sh platform=linux # platform=darwin for MacOS -curl -L https://github.com/lapwat/papeer/releases/download/v0.1.0/papeer-v0.1.0-$platform-amd64 > papeer +curl -L https://github.com/lapwat/papeer/releases/download/v0.2.0/papeer-v0.2.0-$platform-amd64 > papeer chmod +x papeer sudo mv papeer /usr/local/bin ``` ### On Windows -Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.1.0/papeer-v0.1.0-windows-amd64.exe). +Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.2.0/papeer-v0.2.0-windows-amd64.exe). 
## Install kindlegen to export websites to MOBI (optional) @@ -69,10 +69,11 @@ Flags: -d, --delay int time to wait before downloading next chapter, in milliseconds (default -1) -f, --format string file format [md, epub, mobi] (default "md") -h, --help help for papeer + --images retrieve images only -i, --include include URL as first chapter, in resursive mode -l, --limit int limit number of chapters, in recursive mode (default -1) - -o, --output string output file - -q, --quiet do not show logs + -o, --offset int skip first chapters, in recursive mode + --output string output file -r, --recursive create one chapter per natigation item -s, --selector string table of content CSS selector --stdout print to standard output diff --git a/book/link.go b/book/link.go index 4f004a9..8b43be1 100644 --- a/book/link.go +++ b/book/link.go @@ -3,11 +3,10 @@ package book type link struct { href string text string - class string } -func NewLink(href, text, class string) link { - return link{href, text, class} +func NewLink(href, text string) link { + return link{href, text} } func (c link) Href() string { @@ -17,7 +16,3 @@ func (c link) Href() string { func (c link) Text() string { return c.text } - -func (c link) Class() string { - return c.class -} diff --git a/book/progress.go b/book/progress.go new file mode 100644 index 0000000..f642092 --- /dev/null +++ b/book/progress.go @@ -0,0 +1,47 @@ +package book + +import ( + "fmt" + + "github.com/gosuri/uiprogress" +) + +type progress struct { + global *uiprogress.Bar + individuals []*uiprogress.Bar +} + +func NewProgress(links []link) progress { + uiprogress.Start() + + global := uiprogress.AddBar(len(links)) + global.AppendFunc(func(b *uiprogress.Bar) string { + return fmt.Sprintf("Chapters %d / %d", b.Current(), len(links)) + }) + + individuals := []*uiprogress.Bar{} + // hide individual bars if more than 50 chapters + if len(links) <= 50 { + for index, link := range links { + bar := uiprogress.AddBar(1) + barText := 
fmt.Sprintf("%d. %s", index+1, link.text) + bar.AppendFunc(func(b *uiprogress.Bar) string { + return barText + }) + individuals = append(individuals, bar) + } + } + + return progress{global, individuals} +} + +func (p *progress) IncrGlobal() { + p.global.Incr() +} + +func (p *progress) Incr(index int) { + p.global.Incr() + if len(p.individuals) > index { + p.individuals[index].Incr() + } +} diff --git a/book/scraper.go b/book/scraper.go index 3a1027f..6791e50 100644 --- a/book/scraper.go +++ b/book/scraper.go @@ -12,73 +12,66 @@ import ( "github.com/PuerkitoBio/goquery" readability "github.com/go-shiori/go-readability" colly "github.com/gocolly/colly/v2" - "github.com/gosuri/uiprogress" ) -func NewBookFromURL(url, selector string, recursive, include bool, limit, delay int) book { +func NewBookFromURL(url, selector string, recursive, include, images bool, limit, offset, delay int) book { if recursive { - home := NewChapterFromURL(url) - b := New(home.Name(), home.Author()) + chapters := tableOfContent(url, selector, limit, offset, delay, include, images) - chapters := tableOfContent(url, selector, limit, delay) - if include { - b.AddChapter(home) - } + b := New(chapters[0].Name(), chapters[0].Author()) for _, c := range chapters { b.AddChapter(c) } return b } else { - c := NewChapterFromURL(url) + c := NewChapterFromURL(url, images) b := New(c.Name(), c.Author()) b.AddChapter(c) return b } } -func NewChapterFromURL(url string) chapter { +func NewChapterFromURL(url string, images bool) chapter { article, err := readability.FromURL(url, 30*time.Second) if err != nil { log.Fatalf("failed to parse %s, %v\n", url, err) } - return chapter{article.Title, article.Byline, article.Content} + content := strings.ReplaceAll(article.Content, "\n", "") + + if images { + // Load the HTML document + doc, err := goquery.NewDocumentFromReader(strings.NewReader(content)) + if err != nil { + log.Fatal(err) + } + + // Find the image elements + doc.Find("img").Each(func(i int, s 
*goquery.Selection) { + content, _ = goquery.OuterHtml(s) + }) + } + + return chapter{article.Title, article.Byline, content} } -func tableOfContent(url, selector string, limit, delay int) []chapter { +func tableOfContent(url, selector string, limit, offset, delay int, include, images bool) []chapter { base, err := urllib.Parse(url) if err != nil { log.Fatal(err) } - links := GetLinks(base, selector) - if limit != -1 { - limit = int(math.Min(float64(limit), float64(len(links)))) - links = links[:limit] + links, err := GetLinks(base, selector, limit, offset, include) + if err != nil { + log.Fatal(err) } chapters := make([]chapter, len(links)) - - // init global progress bar - uiprogress.Start() - barGlobal := uiprogress.AddBar(len(links)).AppendCompleted().PrependElapsed() - barGlobal.AppendFunc(func(b *uiprogress.Bar) string { - return fmt.Sprintf("Status: %d out of %d chapters", b.Current(), len(links)) - }) - - // init progress bars - bars := []*uiprogress.Bar{} - for index, link := range links { - bar := uiprogress.AddBar(1).AppendCompleted().PrependElapsed() - barText := fmt.Sprintf("%d. 
%s", index+1, link.text) - bar.AppendFunc(func(b *uiprogress.Bar) string { - return barText - }) - bars = append(bars, bar) - } + progress := NewProgress(links) if delay >= 0 { + for index, link := range links { // and then use it to parse relative URLs u, err := base.Parse(link.href) @@ -86,16 +79,15 @@ func tableOfContent(url, selector string, limit, delay int) []chapter { log.Fatal(err) } - chapters[index] = NewChapterFromURL(u.String()) + chapters[index] = NewChapterFromURL(u.String(), images) + progress.Incr(index) - bars[index].Incr() - barGlobal.Incr() - - // do not wait after downloading last chapter - if index < len(links)-1 { - time.Sleep(time.Duration(delay) * time.Millisecond) + // short sleep for last chapter to let the progress bar update + if index == len(links)-1 { + delay = 100 } + time.Sleep(time.Duration(delay) * time.Millisecond) } } else { @@ -112,10 +104,9 @@ func tableOfContent(url, selector string, limit, delay int) []chapter { log.Fatal(err) } - chapters[index] = NewChapterFromURL(u.String()) + chapters[index] = NewChapterFromURL(u.String(), images) + progress.Incr(index) - bars[index].Incr() - barGlobal.Incr() }(index, l) } wg.Wait() @@ -140,8 +131,7 @@ func GetPath(elm *goquery.Selection) string { return join } - -func GetLinks(url *urllib.URL, selector string) []link { +func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool) ([]link, error) { selectorSet := true if selector == "" { selector = "a" @@ -158,13 +148,17 @@ func GetLinks(url *urllib.URL, selector string) []link { href := e.Attr("href") text := strings.TrimSpace(e.Text) path := GetPath(e.DOM) - class := e.Attr("class") - key := fmt.Sprintf("%s.%s", path, class) + key := path + + // include element class in key if selector is not set + if !selectorSet { + class := e.Attr("class") + key = fmt.Sprintf("%s.%s", path, class) + } if selectorSet || text != "" { - pathLinks[key] = append(pathLinks[key], NewLink(href, text, class)) + pathLinks[key] = 
append(pathLinks[key], NewLink(href, text)) pathCount[key] += len(text) - // pathCount[key]++ if pathCount[key] > pathCount[pathMax] { pathMax = key @@ -172,28 +166,24 @@ func GetLinks(url *urllib.URL, selector string) []link { } }) c.Visit(url.String()) - return pathLinks[pathMax] - // // visit and count link classes - // classesLinks := map[string][]link{} - // classesCount := map[string]int{} - // classMax := "" + links := pathLinks[pathMax] + if len(links) == 0 { + return []link{}, fmt.Errorf("no link found for selector: %s", selector) + } - // c := colly.NewCollector() - // c.OnHTML(selector, func(e *colly.HTMLElement) { - // href := e.Attr("href") - // text := strings.TrimSpace(e.Text) - // class := e.Attr("class") + end := len(links) + if limit != -1 { + end = int(math.Min(float64(limit+offset), float64(len(links)))) + } - // if selectorSet || class != "" && text != "" { - // classesLinks[class] = append(classesLinks[class], NewLink(href, text)) - // classesCount[class]++ + links = links[offset:end] - // if classesCount[class] > classesCount[classMax] { - // classMax = class - // } - // } - // }) - // c.Visit(url.String()) - // return classesLinks[classMax] + if include { + c := NewChapterFromURL(url.String(), false) + l := NewLink(url.String(), c.Name()) + links = append([]link{l}, links...) 
+ } + + return links, nil } diff --git a/cmd/get.go b/cmd/get.go index f02b796..dd8e54e 100644 --- a/cmd/get.go +++ b/cmd/get.go @@ -15,9 +15,9 @@ import ( "github.com/lapwat/papeer/book" ) -var quiet, stdout, recursive, include bool +var stdout, recursive, include, images bool var format, output, selector string -var limit, delay int +var limit, offset, delay int var getCmd = &cobra.Command{ Use: "get", @@ -60,6 +60,10 @@ var getCmd = &cobra.Command{ return errors.New("cannot use limit option if not in recursive mode") } + if cmd.Flags().Changed("offset") && recursive == false { + return errors.New("cannot use offset option if not in recursive mode") + } + if cmd.Flags().Changed("delay") && recursive == false { return errors.New("cannot use delay option if not in recursive mode") } @@ -68,7 +72,7 @@ var getCmd = &cobra.Command{ }, Run: func(cmd *cobra.Command, args []string) { url := args[0] - b := book.NewBookFromURL(url, selector, recursive, include, limit, delay) + b := book.NewBookFromURL(url, selector, recursive, include, images, limit, offset, delay) if len(output) == 0 { // set default output @@ -78,12 +82,16 @@ var getCmd = &cobra.Command{ } if format == "md" { - f, err := os.Create(output) - if err != nil { - log.Fatal(err) - } + var f *os.File + var err error - defer f.Close() + if !stdout { + f, err = os.Create(output) + if err != nil { + log.Fatal(err) + } + defer f.Close() + } for _, c := range b.Chapters() { content, err := md.NewConverter("", true, nil).ConvertString(c.Content()) @@ -96,7 +104,6 @@ var getCmd = &cobra.Command{ if stdout { fmt.Println(text) } else { - _, err := f.WriteString(text) if err != nil { log.Fatal(err) @@ -115,8 +122,16 @@ var getCmd = &cobra.Command{ e.SetAuthor(b.Author()) for _, c := range b.Chapters() { - html := fmt.Sprintf("