fix: display images in epub

bump version in readme
2026-05-25 12:27:20 +00:00 · 2021-12-09 23:48:25 +01:00 · 2021-10-13 00:09:00 +02:00
13 changed files with 173 additions and 760 deletions
--- a/14
+++ b/14
@@ -1,14 +0,0 @@
-format:
-	gofmt -s -w .
-
-test:
-	go test github.com/lapwat/papeer/book
-
-install:
-	go install
-
-clean:
-	find . -maxdepth 1 -not -name 'README.md' -name '*.md' -delete
-	find . -maxdepth 1 -name '*.epub' -delete
-	find . -maxdepth 1 -name '*.mobi' -delete
-	find . -maxdepth 1 -name 'papeer-v*' -delete
--- a/README.md
+++ b/README.md
@@ -19,17 +19,14 @@ Available Commands:
  version     Print the version number of papeer

 Flags:
-  -a, --author string     book author
  -d, --delay int         time to wait before downloading next chapter, in milliseconds (default -1)
  -f, --format string     file format [stdout, md, epub, mobi] (default "stdout")
  -h, --help              help for papeer
      --images            retrieve images only
  -i, --include           include URL as first chapter, in resursive mode
  -l, --limit int         limit number of chapters, in recursive mode (default -1)
-  -n, --name string       book name (default: page title)
  -o, --offset int        skip first chapters, in recursive mode
-      --output string     file name (default: book name)
-  -q, --quiet             hide progress bar
+      --output string     output file
  -r, --recursive         create one chapter per natigation item
  -s, --selector string   table of content CSS selector, in resursive mode
  -t, --threads int       download concurrency, in recursive mode (default -1)
@@ -60,20 +57,13 @@ The `recursive` option lets you extract the table of content of a website, then
 Before trying the `recursive` option, it is a good idea to use the `ls` command, which lets you visualize the content that will be retrieved. You can use several options to customize the table of contents extraction, such as `selector`, `limit`, `offset` and `include`. Type `papeer help` for more information about those options.

 ```sh
-papeer ls https://12factor.net/ -s 'section.concrete > article > h2 > a'
-#  #  NAME                    URL                                    
-#  1  I. Codebase             https://12factor.net/codebase          
-#  2  II. Dependencies        https://12factor.net/dependencies      
-#  3  III. Config             https://12factor.net/config            
-#  4  IV. Backing services    https://12factor.net/backing-services  
-#  5  V. Build, release, run  https://12factor.net/build-release-run 
-#  6  VI. Processes           https://12factor.net/processes         
-#  7  VII. Port binding       https://12factor.net/port-binding      
-#  8  VIII. Concurrency       https://12factor.net/concurrency       
-#  9  IX. Disposability       https://12factor.net/disposability     
-# 10  X. Dev/prod parity      https://12factor.net/dev-prod-parity   
-# 11  XI. Logs                https://12factor.net/logs              
-# 12  XII. Admin processes    https://12factor.net/admin-processes
+papeer ls https://news.ycombinator.com/ --limit=5
+#  #  NAME                                                      URL                                                                                             
+#  1  Tailwind CSS v3.0                                         https://tailwindcss.com/blog/tailwindcss-v3                                                     
+#  2  A molten salt storage solution using sodium hydroxide     https://sifted.eu/articles/salt-energy-storage-seaborg-hyme/                                    
+#  3  HashiCorp IPO today                                       https://www.hashicorp.com/blog/a-new-chapter-for-hashicorp                                      
+#  4  Stack Graphs                                              https://github.blog/2021-12-09-introducing-stack-graphs/
+#  5  ‘Tipping point’ makes partisan polarization irreversible  https://news.cornell.edu/stories/2021/12/tipping-point-makes-partisan-polarization-irreversible 
 ```

 ### Scrape time
@@ -81,21 +71,14 @@ papeer ls https://12factor.net/ -s 'section.concrete > article > h2 > a'
 Once you are satisfied with the table of contents listed by the `ls` command, you can actually scrape the content of those pages. You can use the same options that you specified for the `ls` command. In recursive mode, you can also use the `delay` and `threads` options.

 ```sh
-papeer get https://12factor.net/ --recursive -s 'section.concrete > article > h2 > a' --format=md
-# [======================================>-----------------------------] Chapters 7 / 12
-# [====================================================================] 1. I. Codebase
-# [====================================================================] 2. II. Dependencies
-# [====================================================================] 3. III. Config
-# [====================================================================] 4. IV. Backing services
-# [====================================================================] 5. V. Build, release, run
-# [====================================================================] 6. VI. Processes
-# [====================================================================] 7. VII. Port binding
-# [--------------------------------------------------------------------] 8. VIII. Concurrency
-# [--------------------------------------------------------------------] 9. IX. Disposability
-# [--------------------------------------------------------------------] 10. X. Dev/prod parity
-# [--------------------------------------------------------------------] 11. XI. Logs
-# [--------------------------------------------------------------------] 12. XII. Admin processes
-# Markdown saved to "The_Twelve-Factor_App.md"
+papeer get https://news.ycombinator.com/ --recursive --delay=500 --limit=5 --format=md
+# [========================================>---------------------------] Chapters 3 / 5
+# [====================================================================] 1. Tailwind CSS v3.0
+# [====================================================================] 2. A molten salt storage solution using sodium hydroxide
+# [====================================================================] 3. HashiCorp IPO today
+# [--------------------------------------------------------------------] 4. Stack Graphs
+# [--------------------------------------------------------------------] 5. ‘Tipping point’ makes partisan polarization irreversible
+# Markdown saved to "Hacker News.md"
 ```

 # Installation
@@ -112,15 +95,14 @@ go get -u github.com/lapwat/papeer

 ```sh
 platform=linux # use platform=darwin for MacOS
-release=0.3.3
-curl -L https://github.com/lapwat/papeer/releases/download/v$release/papeer-v$release-$platform-amd64 > papeer
+curl -L https://github.com/lapwat/papeer/releases/download/v0.3.0/papeer-v0.3.0-$platform-amd64 > papeer
 chmod +x papeer
 sudo mv papeer /usr/local/bin
 ```

 ### On Windows

-Download [latest release](https://github.com/lapwat/papeer/releases/download/3/papeer-v0.3.3-windows-amd64.exe).
+Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.3.0/papeer-v0.3.0-windows-amd64.exe).

 ## Install kindlegen to export websites to MOBI (optional)

@@ -152,4 +134,4 @@ You can replace `bash` by your own shell (zsh, fish or powershell).
 - `html-to-markdown` convert HTML to Markdown
 - `go-epub` convert HTML to EPUB
 - `colly` query HTML trees
- `uiprogress` display progress bars
+- `uiprogress` display progress bars
--- a/book/chapter.go
+++ b/book/chapter.go
@@ -1,20 +1,13 @@
 package book

 type chapter struct {
-	body        string
-	name        string
-	author      string
-	content     string
-	subChapters []chapter
-	config      *ScrapeConfig
+	name    string
+	author  string
+	content string
 }

-func NewChapter(body, name, author, content string, subChapters []chapter, config *ScrapeConfig) chapter {
-	return chapter{body, name, author, content, subChapters, config}
-}
-
-func (c chapter) Body() string {
-	return c.body
+func NewChapter(name, author, content string) chapter {
+	return chapter{name, author, content}
 }

 func (c chapter) Name() string {
@@ -28,7 +21,3 @@ func (c chapter) Author() string {
 func (c chapter) Content() string {
 	return c.content
 }
-
-func (c chapter) SubChapters() []chapter {
-	return c.subChapters
-}
--- a/book/format.go
+++ b/book/format.go
@@ -1,143 +0,0 @@
-package book
-
-import (
-	"fmt"
-	"log"
-	"os"
-	"os/exec"
-	"strings"
-
-	md "github.com/JohannesKaufmann/html-to-markdown"
-	"github.com/PuerkitoBio/goquery"
-	epub "github.com/bmaupin/go-epub"
-)
-
-func Filename(name string) string {
-	filename := name
-
-	filename = strings.ReplaceAll(filename, " ", "_")
-	filename = strings.ReplaceAll(filename, "/", "")
-
-	return filename
-}
-
-func ToMarkdown(c chapter) string {
-	markdown := ""
-
-	if c.config.include {
-		// title
-		markdown += fmt.Sprintf("%s\n", c.Name())
-		markdown += fmt.Sprintf("%s\n\n", strings.Repeat("=", len(c.Name())))
-
-		// convert content to markdown
-		content, err := md.NewConverter("", true, nil).ConvertString(c.Content())
-		if err != nil {
-			log.Fatal(err)
-		}
-		markdown += fmt.Sprintf("%s\n\n\n", content)
-	}
-
-	for _, sc := range c.SubChapters() {
-		// subchapters content
-		markdown += fmt.Sprintf("%s\n\n\n", ToMarkdown(sc))
-	}
-
-	return markdown
-}
-
-func ToEpub(c chapter, filename string) string {
-	if len(filename) == 0 {
-		filename = fmt.Sprintf("%s.epub", Filename(c.Name()))
-	}
-
-	// init ebook
-	e := epub.NewEpub(c.Name())
-	e.SetAuthor(c.Author())
-
-	AppendToEpub(e, c)
-
-	err := e.Write(filename)
-	if err != nil {
-		log.Fatal(err)
-	}
-
-	return filename
-}
-
-func AppendToEpub(e *epub.Epub, c chapter) {
-	content := ""
-
-	if c.config.include {
-
-		if c.config.imagesOnly == false {
-			content = c.Content()
-		}
-
-		// parse content
-		doc, err := goquery.NewDocumentFromReader(strings.NewReader(c.Content()))
-		if err != nil {
-			log.Fatal(err)
-		}
-
-		// download images and replace src in img tags of content
-		doc.Find("img").Each(func(i int, s *goquery.Selection) {
-			src, _ := s.Attr("src")
-			src = strings.Split(src, "?")[0] // remove query part
-			imagePath, _ := e.AddImage(src, "")
-
-			if c.config.imagesOnly {
-				imageTag, _ := goquery.OuterHtml(s)
-				content += strings.Replace(imageTag, src, imagePath, 1)
-			} else {
-				content = strings.Replace(content, src, imagePath, 1)
-			}
-		})
-
-		html := ""
-		// add title only if imagesOnly = false
-		if c.config.imagesOnly == false {
-			html += fmt.Sprintf("<h1>%s</h1>", c.Name())
-		}
-		html += content
-
-		//  write to epub file
-		_, err = e.AddSection(html, c.Name(), "", "")
-		if err != nil {
-			log.Fatal(err)
-		}
-
-	}
-
-	for _, sc := range c.SubChapters() {
-		AppendToEpub(e, sc)
-	}
-}
-
-func ToMobi(c chapter, filename string) string {
-	if len(filename) == 0 {
-		filename = fmt.Sprintf("%s.mobi", Filename(c.Name()))
-	} else {
-
-		// add .mobi extension if not specified
-		if strings.HasSuffix(filename, ".mobi") == false {
-			filename = fmt.Sprintf("%s.mobi", filename)
-		}
-
-	}
-
-	filenameEPUB := strings.ReplaceAll(filename, ".mobi", ".epub")
-	ToEpub(c, filenameEPUB)
-
-	exec.Command("kindlegen", filenameEPUB).Run()
-	// exec command always return status 1 even if it succeed
-	// if err != nil {
-	// 	log.Fatal(err)
-	// }
-
-	err := os.Remove(filenameEPUB)
-	if err != nil {
-		log.Fatal(err)
-	}
-
-	return filename
-}
--- a/book/format_test.go
+++ b/book/format_test.go
@@ -1,79 +0,0 @@
-package book
-
-import (
-	"errors"
-	"os"
-	"testing"
-)
-
-func TestFilename(t *testing.T) {
-
-	got := Filename("This is a chapter / book")
-	want := "This_is_a_chapter__book"
-
-	if got != want {
-		t.Errorf("got %q, wanted %q", got, want)
-	}
-
-}
-
-func TestToMarkdown(t *testing.T) {
-
-	c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
-
-	got := ToMarkdown(c)
-	want := "Books\n=====\n\n- [Discours de la Méthode](https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/)clock 98 min read -\n1637\n\n- [The Twelve-Factor App](https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/)clock 22 min read -\n2011\n\n\n"
-
-	if got != want {
-		t.Errorf("got %q, wanted %q", got, want)
-	}
-
-}
-
-func TestToEpub(t *testing.T) {
-
-	filename := "Books.epub"
-	c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
-	ToEpub(c, "")
-
-	if _, err := os.Stat(filename); errors.Is(err, os.ErrNotExist) {
-		t.Errorf("%s does not exist: %v", filename, err)
-	} else {
-		if err := os.Remove(filename); err != nil {
-			t.Errorf("cannot remove %v: %v", filename, err)
-		}
-	}
-
-}
-
-func TestToEpubFilename(t *testing.T) {
-
-	filename := "ebook.epub"
-	c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
-	ToEpub(c, filename)
-
-	if _, err := os.Stat(filename); errors.Is(err, os.ErrNotExist) {
-		t.Errorf("%s does not exist: %v", filename, err)
-	} else {
-		if err := os.Remove(filename); err != nil {
-			t.Errorf("cannot remove %v: %v", filename, err)
-		}
-	}
-
-}
-
-func TestToMobi(t *testing.T) {
-
-	filename := "ebook.mobi"
-	c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
-	ToMobi(c, filename)
-
-	if _, err := os.Stat(filename); errors.Is(err, os.ErrNotExist) {
-		t.Errorf("%s does not exist: %v", filename, err)
-	} else {
-		if err := os.Remove(filename); err != nil {
-			t.Errorf("cannot remove %v: %v", filename, err)
-		}
-	}
-
-}
--- a/book/progress.go
+++ b/book/progress.go
@@ -2,7 +2,6 @@ package book

 import (
 	"fmt"
-	"strings"

 	"github.com/gosuri/uiprogress"
 )
@@ -12,22 +11,20 @@ type progress struct {
 	individuals []*uiprogress.Bar
 }

-func NewProgress(links []link, parent string, depth int) progress {
+func NewProgress(links []link) progress {
 	uiprogress.Start()

 	global := uiprogress.AddBar(len(links))
-	indentGlobal := strings.Repeat("> ", depth)
 	global.AppendFunc(func(b *uiprogress.Bar) string {
-		return fmt.Sprintf("%v%v (%v / %v)", indentGlobal, parent, b.Current(), len(links))
+		return fmt.Sprintf("Chapters %d / %d", b.Current(), len(links))
 	})

 	// hide individual bars if more than 50 chapters
 	individuals := []*uiprogress.Bar{}
-	indent := strings.Repeat("- ", depth)
 	if len(links) <= 50 {
 		for index, link := range links {
 			bar := uiprogress.AddBar(1)
-			barText := fmt.Sprintf("%v#%v %v", indent, index+1, link.Text())
+			barText := fmt.Sprintf("%d. %s", index+1, link.text)
 			bar.AppendFunc(func(b *uiprogress.Bar) string {
 				return barText
 			})
@@ -38,22 +35,13 @@ func NewProgress(links []link, parent string, depth int) progress {
 	return progress{global, individuals}
 }

-func (p *progress) IncrementGlobal() {
+func (p *progress) IncrGlobal() {
 	p.global.Incr()
 }

-func (p *progress) Increment(index int) {
-	p.IncrementGlobal()
+func (p *progress) Incr(index int) {
+	p.global.Incr()
 	if len(p.individuals) > index {
 		p.individuals[index].Incr()
 	}
 }
-
-func (p *progress) UpdateName(index int, name string) {
-	if len(p.individuals) > index {
-		barText := fmt.Sprintf("%s", name)
-		p.individuals[index].AppendFunc(func(b *uiprogress.Bar) string {
-			return barText
-		})
-	}
-}
--- a/book/scraper.go
+++ b/book/scraper.go
@@ -1,12 +1,9 @@
 package book

 import (
-	"bytes"
 	"fmt"
-	"io"
 	"log"
 	"math"
-	"net/http"
 	urllib "net/url"
 	"strings"
 	"sync"
@@ -17,247 +14,63 @@ import (
 	colly "github.com/gocolly/colly/v2"
 )

-type ScrapeConfig struct {
-	depth      int
-	selector   string
-	limit      int
-	offset     int
-	delay      int
-	threads    int
-	include    bool
-	imagesOnly bool
-}
-
-func NewScrapeConfig() *ScrapeConfig {
-	return &ScrapeConfig{0, "", -1, 0, -1, -1, true, false}
-}
-
-func NewScrapeConfigsAjin() []*ScrapeConfig {
-	config0 := NewScrapeConfig()
-	config0.depth = 0
-	config0.selector = ".dt>a"
-	config0.limit = 3
-	config0.offset = 0
-	config0.delay = 5000
-	config0.include = false
-
-	config1 := NewScrapeConfig()
-	config1.depth = 1
-	config1.selector = ".nav_apb>a"
-	config1.limit = 3
-	config1.offset = 1
-	config1.delay = 5000
-	config1.include = false
-
-	config2 := NewScrapeConfig()
-	config2.depth = 2
-	config2.imagesOnly = true
-
-	return []*ScrapeConfig{config0, config1, config2}
-}
-
-func NewScrapeConfigsWikipedia() []*ScrapeConfig {
-	config0 := NewScrapeConfig()
-	config0.depth = 0
-	config0.threads = -1
-	config0.include = true
-
-	config1 := NewScrapeConfig()
-	config1.depth = 1
-	config1.include = true
-
-	return []*ScrapeConfig{config0, config1}
-}
-
-func NewScrapeConfigFake() *ScrapeConfig {
-	config := NewScrapeConfig()
-	config.include = false
-
-	return config
-}
-
-func NewBookFromURL(url, selector, name, author string, recursive, include, imagesOnly, quiet bool, limit, offset, delay, threads int) book {
-	config1 := NewScrapeConfig()
-	config1.imagesOnly = imagesOnly
-
-	var chapters []chapter
-	var home chapter
-
+func NewBookFromURL(url, selector string, recursive, include, images bool, limit, offset, delay, threads int) book {
 	if recursive {
-		config2 := NewScrapeConfig()
-		config2.selector = selector
-		config2.limit = limit
-		config2.offset = offset
-		config2.delay = delay
-		config2.threads = threads
-		config2.include = include
-		config2.imagesOnly = imagesOnly
-		chapters, home = tableOfContent(url, config2, config1, quiet)
+		chapters := tableOfContent(url, selector, limit, offset, delay, threads, include, images)
+
+		b := New(chapters[0].Name(), chapters[0].Author())
+		for _, c := range chapters {
+			b.AddChapter(c)
+		}
+
+		return b
 	} else {
-		chapters = []chapter{NewChapterFromURL(url, []*ScrapeConfig{config1}, 0, func(index int, name string) {})}
-		home = chapters[0]
-	}
-
-	if len(name) == 0 {
-		name = home.Name()
-	}
-
-	if len(author) == 0 {
-		author = home.Author()
-	}
-
-	b := New(name, author)
-	for _, c := range chapters {
+		c := NewChapterFromURL(url, images)
+		b := New(c.Name(), c.Author())
 		b.AddChapter(c)
+		return b
 	}
-
-	return b
 }

-func NewChapterFromURL(url string, configs []*ScrapeConfig, index int, updateProgressBarName func(index int, name string)) chapter {
-	config := configs[0]
-
-	base, err := urllib.Parse(url)
-	if err != nil {
-		log.Fatal(err)
-	}
-
-	// get page body
-	response, err := http.Get(url)
-	if err != nil {
-		log.Fatal(err)
-	}
-	defer response.Body.Close()
-
-	// duplicate response stream
-	readabilityReader := &bytes.Buffer{}
-	bodyReader := io.TeeReader(response.Body, readabilityReader)
-
-	// extract HTML body
-	body, err := io.ReadAll(bodyReader)
-
-	// extract article content and metadata
-	article, err := readability.FromReader(readabilityReader, base)
+func NewChapterFromURL(url string, images bool) chapter {
+	article, err := readability.FromURL(url, 30*time.Second)
 	if err != nil {
 		log.Fatalf("failed to parse %s, %v\n", url, err)
 	}
-	name := article.Title

-	// notify progress bar with new name
-	updateProgressBarName(index, name)
+	content := strings.ReplaceAll(article.Content, "\n", "")

-	subchapters := []chapter{}
-	if len(configs) > 1 {
-		// add subchapters
-
-		links, _, err := GetLinks(base, config.selector, config.limit, config.offset, false)
+	if images {
+		// parse html content
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
 		if err != nil {
 			log.Fatal(err)
 		}

-		subchapters = make([]chapter, len(links))
-		progress := NewProgress(links, name, config.depth)
-
-		if config.delay >= 0 {
-
-			// synchronous mode
-			for index, link := range links {
-				// and then use it to parse relative URLs
-				u, err := base.Parse(link.href)
-				if err != nil {
-					log.Fatal(err)
-				}
-
-				sc := NewChapterFromURL(u.String(), configs[1:], index, progress.UpdateName)
-				subchapters[index] = sc
-				progress.Increment(index)
-
-				time.Sleep(time.Duration(config.delay) * time.Millisecond)
-			}
-
-		} else {
-			// asynchronous mode
-			var wg sync.WaitGroup
-
-			threads := config.threads
-			if threads == -1 {
-				threads = len(links)
-			}
-			semaphore := make(chan bool, threads)
-
-			for index, l := range links {
-
-				wg.Add(1)
-				semaphore <- true
-
-				go func(index int, l link) {
-					defer wg.Done()
-
-					// and then use it to parse relative URLs
-					u, err := base.Parse(l.href)
-					if err != nil {
-						log.Fatal(err)
-					}
-
-					sc := NewChapterFromURL(u.String(), configs[1:], index, progress.UpdateName)
-					subchapters[index] = sc
-					progress.Increment(index)
-
-					<-semaphore
-				}(index, l)
-			}
-			wg.Wait()
-		}
+		// extract images only
+		content = ""
+		doc.Find("img").Each(func(i int, s *goquery.Selection) {
+			newContent, _ := goquery.OuterHtml(s)
+			content += newContent
+		})
 	}

-	content := ""
-	if config.include {
-
-		// we care about the content only if we include this level
-		content = article.Content
-
-		// extract images
-		if config.imagesOnly {
-
-			// parse HTML
-			doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
-			if err != nil {
-				log.Fatal(err)
-			}
-
-			// append every image to content
-			content = ""
-			doc.Find("img").Each(func(i int, s *goquery.Selection) {
-				imageTag, _ := goquery.OuterHtml(s)
-				imageTag = strings.ReplaceAll(imageTag, "\n", "")
-
-				content += imageTag
-			})
-
-		}
-	}
-
-	return chapter{string(body), name, article.Byline, content, subchapters, config}
+	return chapter{article.Title, article.Byline, content}
 }

-func tableOfContent(url string, config *ScrapeConfig, subConfig *ScrapeConfig, quiet bool) ([]chapter, chapter) {
+func tableOfContent(url, selector string, limit, offset, delay, threads int, include, images bool) []chapter {
 	base, err := urllib.Parse(url)
 	if err != nil {
 		log.Fatal(err)
 	}

-	links, home, err := GetLinks(base, config.selector, config.limit, config.offset, config.include)
+	links, err := GetLinks(base, selector, limit, offset, include)
 	if err != nil {
 		log.Fatal(err)
 	}

 	chapters := make([]chapter, len(links))
-	delay := config.delay
-
-	var p progress
-	if quiet == false {
-		p = NewProgress(links, "", 0)
-	}
+	progress := NewProgress(links)

 	if delay >= 0 {
 		// synchronous mode
@@ -269,11 +82,8 @@ func tableOfContent(url string, config *ScrapeConfig, subConfig *ScrapeConfig, q
 				log.Fatal(err)
 			}

-			chapters[index] = NewChapterFromURL(u.String(), []*ScrapeConfig{subConfig}, 0, func(index int, name string) {})
-			
-			if quiet == false {
-				p.Increment(index)
-			}
+			chapters[index] = NewChapterFromURL(u.String(), images)
+			progress.Incr(index)

 			// short sleep for last chapter to let the progress bar update
 			if index == len(links)-1 {
@@ -287,7 +97,6 @@ func tableOfContent(url string, config *ScrapeConfig, subConfig *ScrapeConfig, q
 		// asynchronous mode
 		var wg sync.WaitGroup

-		threads := config.threads
 		if threads == -1 {
 			threads = len(links)
 		}
@@ -307,19 +116,15 @@ func tableOfContent(url string, config *ScrapeConfig, subConfig *ScrapeConfig, q
 					log.Fatal(err)
 				}

-				chapters[index] = NewChapterFromURL(u.String(), []*ScrapeConfig{subConfig}, 0, func(index int, name string) {})
-
-				if quiet == false {
-					p.Increment(index)
-				}
+				chapters[index] = NewChapterFromURL(u.String(), images)
+				progress.Incr(index)

 				<-semaphore
 			}(index, l)
 		}
 		wg.Wait()
 	}
-
-	return chapters, home
+	return chapters
 }

 func GetPath(elm *goquery.Selection) string {
@@ -339,7 +144,7 @@ func GetPath(elm *goquery.Selection) string {
 	return join
 }

-func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool) ([]link, chapter, error) {
+func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool) ([]link, error) {
 	selectorSet := true
 	if selector == "" {
 		selector = "a"
@@ -377,7 +182,7 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool)

 	links := pathLinks[pathMax]
 	if len(links) == 0 {
-		return []link{}, chapter{}, fmt.Errorf("no link found for selector: %s", selector)
+		return []link{}, fmt.Errorf("no link found for selector: %s", selector)
 	}

 	end := len(links)
@@ -387,12 +192,11 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool)

 	links = links[offset:end]

-	home := NewChapterFromURL(url.String(), []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
-
 	if include {
-		l := NewLink(url.String(), home.Name())
+		c := NewChapterFromURL(url.String(), false)
+		l := NewLink(url.String(), c.Name())
 		links = append([]link{l}, links...)
 	}

-	return links, home, nil
+	return links, nil
 }
--- a/book/scraper_test.go
+++ b/book/scraper_test.go
@@ -1,184 +0,0 @@
-package book
-
-import (
-	"testing"
-	"time"
-)
-
-func TestBody(t *testing.T) {
-
-	config := NewScrapeConfig()
-	c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config}, 0, func(index int, name string) {})
-
-	got := c.Body()
-	want := "<!doctype html>\n<html lang=\"en-us\">\n  <head>\n    <title>Books</title>\n    <link rel=\"shortcut icon\" href=\"/favicon.ico\" />\n    <meta charset=\"utf-8\" />\n    <meta name=\"generator\" content=\"Hugo 0.59.1\" />\n    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />\n    <meta name=\"author\" content=\"John Doe\" />\n    <meta name=\"description\" content=\" \" />\n    <link rel=\"stylesheet\" href=\"https://books.lapw.at/css/main.min.88e7083eff65effb7485b6e6f38d10afbec25093a6fac42d734ce9024d3defbd.css\" />\n\n    \n    <meta name=\"twitter:card\" content=\"summary\"/>\n<meta name=\"twitter:title\" content=\"Books\"/>\n<meta name=\"twitter:description\" content=\" \"/>\n\n    <meta property=\"og:title\" content=\"Books\" />\n<meta property=\"og:description\" content=\" \" />\n<meta property=\"og:type\" content=\"website\" />\n<meta property=\"og:url\" content=\"https://books.lapw.at/\" />\n\n\n\n  </head>\n  <body>\n    <header class=\"app-header\">\n      <a href=\"https://books.lapw.at/\"><img class=\"app-header-avatar\" src=\"/book.svg\" alt=\"John Doe\" /></a>\n      <h1>Books</h1>\n      <p> </p>\n      <div class=\"app-header-social\">\n        \n      </div>\n    </header>\n    <main class=\"app-container\">\n      \n  <article>\n    <h1>Books</h1>\n    <ul class=\"posts-list\">\n      \n        <li class=\"posts-list-item\">\n          <a class=\"posts-list-item-title\" href=\"https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/\">Discours de la Méthode</a>\n          <span class=\"posts-list-item-description\">\n            <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\" class=\"icon icon-clock\">\n  <title>clock</title>\n  <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 98 min read 
-\n            1637\n          </span>\n        </li>\n      \n        <li class=\"posts-list-item\">\n          <a class=\"posts-list-item-title\" href=\"https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/\">The Twelve-Factor App</a>\n          <span class=\"posts-list-item-description\">\n            <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\" class=\"icon icon-clock\">\n  <title>clock</title>\n  <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 22 min read -\n            2011\n          </span>\n        </li>\n      \n    </ul>\n    \n\n\n\n  </article>\n\n    </main>\n  </body>\n</html>\n"
-
-	if got != want {
-		t.Errorf("got %v, wanted %v", got, want)
-	}
-
-}
-
-func TestName(t *testing.T) {
-
-	config := NewScrapeConfig()
-	c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config}, 0, func(index int, name string) {})
-
-	got := c.Name()
-	want := "Books"
-
-	if got != want {
-		t.Errorf("got %v, wanted %v", got, want)
-	}
-
-}
-
-func TestAuthor(t *testing.T) {
-
-	config := NewScrapeConfig()
-	c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config}, 0, func(index int, name string) {})
-
-	got := c.Author()
-	want := "John Doe"
-
-	if got != want {
-		t.Errorf("got %v, wanted %v", got, want)
-	}
-
-}
-
-func TestContent(t *testing.T) {
-
-	config := NewScrapeConfig()
-	c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config}, 0, func(index int, name string) {})
-
-	got := c.Content()
-	want := "<div id=\"readability-page-1\" class=\"page\">\n    \n    <main>\n      \n  <article>\n    \n    <ul>\n      \n        <li>\n          <a href=\"https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/\">Discours de la Méthode</a>\n          <span>\n            <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\">\n  <title>clock</title>\n  <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 98 min read -\n            1637\n          </span>\n        </li>\n      \n        <li>\n          <a href=\"https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/\">The Twelve-Factor App</a>\n          <span>\n            <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\">\n  <title>clock</title>\n  <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 22 min read -\n            2011\n          </span>\n        </li>\n      \n    </ul>\n    \n\n\n\n  </article>\n\n    </main>\n  \n\n</div>"
-
-	if got != want {
-		t.Errorf("got %v, wanted %v", got, want)
-	}
-
-}
-
-func TestDelay(t *testing.T) {
-
-	config0 := NewScrapeConfig()
-	config0.delay = 500
-
-	config1 := NewScrapeConfig()
-
-	start := time.Now()
-	NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {})
-	elapsed := time.Since(start)
-
-	got := elapsed
-	want := time.Duration(500) * time.Millisecond
-
-	if got < want {
-		t.Errorf("got %v, wanted min %v", got, want)
-	}
-
-}
-
-func TestContentImagesOnly(t *testing.T) {
-
-	config := NewScrapeConfig()
-	config.imagesOnly = true
-
-	c := NewChapterFromURL("https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/", []*ScrapeConfig{config}, 0, func(index int, name string) {})
-
-	got := c.Content()
-	want := "<img src=\"https://books.lapw.at/images/codebase-deploys.png\" alt=\"One codebase maps to many deploys\"/><img src=\"https://books.lapw.at/images/attached-resources.png\" alt=\"A production deploy attached to four backing services.\"/><img src=\"https://books.lapw.at/images/release.png\" alt=\"Code becomes a build, which is combined with config to create a release.\"/><img src=\"https://books.lapw.at/images/process-types.png\" alt=\"Scale is expressed as running processes, workload diversity is expressed as process types.\"/>"
-
-	if got != want {
-		t.Errorf("got %v, wanted %v", got, want)
-	}
-
-}
-
-func TestSubChapters(t *testing.T) {
-
-	config0 := NewScrapeConfig()
-	config1 := NewScrapeConfig()
-
-	c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {})
-
-	got := len(c.SubChapters())
-	want := 2
-
-	if got != want {
-		t.Errorf("got %v, wanted %v", got, want)
-	}
-
-}
-
-func TestSubChaptersSelector(t *testing.T) {
-
-	config0 := NewScrapeConfig()
-	config0.selector = "section.concrete > article > h2 > a"
-
-	config1 := NewScrapeConfig()
-
-	c := NewChapterFromURL("https://12factor.net/", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {})
-
-	got := len(c.SubChapters())
-	want := 12
-
-	if got != want {
-		t.Errorf("got %v, wanted %v", got, want)
-	}
-
-}
-
-func TestSubChaptersLimit(t *testing.T) {
-
-	config0 := NewScrapeConfig()
-	config0.limit = 1
-
-	config1 := NewScrapeConfig()
-
-	c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {})
-
-	got := len(c.SubChapters())
-	want := 1
-
-	if got != want {
-		t.Errorf("got %v, wanted %v", got, want)
-	}
-
-}
-
-func TestSubChaptersLimitOver(t *testing.T) {
-
-	config0 := NewScrapeConfig()
-	config0.limit = 3
-
-	config1 := NewScrapeConfig()
-
-	c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {})
-
-	got := len(c.SubChapters())
-	want := 2
-
-	if got != want {
-		t.Errorf("got %v, wanted %v", got, want)
-	}
-
-}
-
-func TestNotInclude(t *testing.T) {
-
-	config := NewScrapeConfig()
-	config.include = false
-
-	c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config}, 0, func(index int, name string) {})
-
-	got := c.Content()
-	want := ""
-
-	if got != want {
-		t.Errorf("got %v, wanted %v", got, want)
-	}
-
-}
--- a/cmd/get.go
+++ b/cmd/get.go
@@ -5,15 +5,19 @@ import (
 	"fmt"
 	"log"
 	"os"
+	"os/exec"
 	"strings"

+	md "github.com/JohannesKaufmann/html-to-markdown"
+	"github.com/PuerkitoBio/goquery"
+	epub "github.com/bmaupin/go-epub"
 	"github.com/spf13/cobra"

 	"github.com/lapwat/papeer/book"
 )

-var recursive, include, images, quiet bool
-var format, output, selector, name, author string
+var recursive, include, images bool
+var format, output, selector string
 var limit, offset, delay, threads int

 var getCmd = &cobra.Command{
@@ -73,48 +77,123 @@ var getCmd = &cobra.Command{
 	},
 	Run: func(cmd *cobra.Command, args []string) {
 		url := args[0]
-		b := book.NewBookFromURL(url, selector, name, author, recursive, include, images, quiet, limit, offset, delay, threads)
+		b := book.NewBookFromURL(url, selector, recursive, include, images, limit, offset, delay, threads)

-		fakeConfig := book.NewScrapeConfigFake()
-		fakeChapter := book.NewChapter("", b.Name(), b.Author(), "", b.Chapters(), fakeConfig)
+		if len(output) == 0 {
+			// set default output
+			output = strings.ReplaceAll(b.Name(), " ", "_")
+			output = strings.ReplaceAll(output, "/", "")
+			output = fmt.Sprintf("%s.%s", output, format)
+		}

 		if format == "stdout" {
-			// TODO: ToMarkdownString
-			markdown := book.ToMarkdown(fakeChapter)
-			fmt.Println(markdown)
+
+			for _, c := range b.Chapters() {
+				// convert to markdown
+				content, err := md.NewConverter("", true, nil).ConvertString(c.Content())
+				if err != nil {
+					log.Fatal(err)
+				}
+
+				text := fmt.Sprintf("%s\n%s\n\n%s\n\n\n", c.Name(), strings.Repeat("=", len(c.Name())), content)
+
+				// write to stdout
+				fmt.Println(text)
+			}
+
 		}

 		if format == "md" {
-			// TODO: ToMarkdownFile
-			markdown := book.ToMarkdown(fakeChapter)

-			if len(output) == 0 {
-				filename := book.Filename(fakeChapter.Name())
-				output = fmt.Sprintf("%s.md", filename)
-			}
-
-			// write to file
+			// create markdown file
 			f, err := os.Create(output)
 			if err != nil {
 				log.Fatal(err)
 			}
-			_, err2 := f.WriteString(markdown)
-			if err2 != nil {
-				log.Fatal(err2)
+			defer f.Close()
+
+			for _, c := range b.Chapters() {
+				// convert to markdown
+				content, err := md.NewConverter("", true, nil).ConvertString(c.Content())
+				if err != nil {
+					log.Fatal(err)
+				}
+
+				text := fmt.Sprintf("%s\n%s\n\n%s\n\n\n", c.Name(), strings.Repeat("=", len(c.Name())), content)
+
+				// write to markdown file
+				_, err = f.WriteString(text)
+				if err != nil {
+					log.Fatal(err)
+				}
 			}
-			f.Close()

 			fmt.Printf("Markdown saved to \"%s\"\n", output)
 		}

 		if format == "epub" {
-			output = book.ToEpub(fakeChapter, output)
+			e := epub.NewEpub(b.Name())
+			e.SetAuthor(b.Author())
+
+			for _, c := range b.Chapters() {
+				// parse the chapter HTML so its <img> elements can be located
+				doc, err := goquery.NewDocumentFromReader(strings.NewReader(c.Content()))
+				if err != nil {
+					log.Fatal(err)
+				}
+
+				// add each image to the epub and rewrite its src to the path returned by AddImage
+				contentWithLocalImages := c.Content()
+				doc.Find("img").Each(func(i int, s *goquery.Selection) {
+					src, _ := s.Attr("src")
+					imagePath, _ := e.AddImage(src, "")
+
+					contentWithLocalImages = strings.ReplaceAll(contentWithLocalImages, src, imagePath)
+				})
+
+				html := fmt.Sprintf("<h1>%s</h1>%s", c.Name(), contentWithLocalImages)
+				_, err = e.AddSection(html, c.Name(), "", "")
+				if err != nil {
+					log.Fatal(err)
+				}
+
+			}
+
+			err := e.Write(output)
+			if err != nil {
+				log.Fatal(err)
+			}
+
 			fmt.Printf("Ebook saved to \"%s\"\n", output)
 		}

 		if format == "mobi" {
-			output = book.ToMobi(fakeChapter, output)
+			e := epub.NewEpub(b.Name())
+			e.SetAuthor(b.Author())
+
+			for _, chapter := range b.Chapters() {
+				e.AddSection(chapter.Content(), chapter.Name(), "", "")
+			}
+
+			outputEPUB := strings.ReplaceAll(output, ".mobi", ".epub")
+
+			err := e.Write(outputEPUB)
+			if err != nil {
+				log.Fatal(err)
+			}
+
+			exec.Command("kindlegen", outputEPUB).Run()
+			// exit status is deliberately ignored: kindlegen reportedly returns status 1 even when it succeeds (TODO confirm)
+			// if err != nil {
+			// 	log.Fatal(err)
+			// }
+
 			fmt.Printf("Ebook saved to \"%s\"\n", output)
+
+			err2 := os.Remove(outputEPUB)
+			if err2 != nil {
+				log.Fatal(err2)
+			}
 		}
 	},
 }
--- a/cmd/list.go
+++ b/cmd/list.go
@@ -27,7 +27,7 @@ var listCmd = &cobra.Command{
 			log.Fatal(err)
 		}

-		links, _, err := book.GetLinks(base, selector, limit, offset, include)
+		links, err := book.GetLinks(base, selector, limit, offset, include)
 		if err != nil {
 			log.Fatal(err)
 		}
--- a/cmd/root.go
+++ b/cmd/root.go
@@ -23,15 +23,12 @@ func Execute() {
 }

 func init() {
-	rootCmd.PersistentFlags().StringVarP(&name, "name", "n", "", "book name (default: page title)")
-	rootCmd.PersistentFlags().StringVarP(&author, "author", "a", "", "book author")
 	rootCmd.PersistentFlags().StringVarP(&format, "format", "f", "stdout", "file format [stdout, md, epub, mobi]")
-	rootCmd.PersistentFlags().StringVarP(&output, "output", "", "", "file name (default: book name)")
+	rootCmd.PersistentFlags().StringVarP(&output, "output", "", "", "output file")
 	rootCmd.PersistentFlags().StringVarP(&selector, "selector", "s", "", "table of content CSS selector, in resursive mode")
 	rootCmd.PersistentFlags().BoolVarP(&recursive, "recursive", "r", false, "create one chapter per natigation item")
 	rootCmd.PersistentFlags().BoolVarP(&include, "include", "i", false, "include URL as first chapter, in resursive mode")
 	rootCmd.PersistentFlags().BoolVarP(&images, "images", "", false, "retrieve images only")
-	rootCmd.PersistentFlags().BoolVarP(&quiet, "quiet", "q", false, "hide progress bar")
 	rootCmd.PersistentFlags().IntVarP(&limit, "limit", "l", -1, "limit number of chapters, in recursive mode")
 	rootCmd.PersistentFlags().IntVarP(&offset, "offset", "o", 0, "skip first chapters, in recursive mode")
 	rootCmd.PersistentFlags().IntVarP(&delay, "delay", "d", -1, "time to wait before downloading next chapter, in milliseconds")
--- a/cmd/version.go
+++ b/cmd/version.go
@@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{
 	Use:   "version",
 	Short: "Print the version number of papeer",
 	Run: func(cmd *cobra.Command, args []string) {
-		fmt.Println("papeer v0.3.3")
+		fmt.Println("papeer v0.3.0")
 	},
 }
--- a/release.sh
+++ b/release.sh
@@ -3,18 +3,12 @@
 version=$1
 platforms=("linux/amd64" "darwin/amd64" "windows/amd64")

-if [ "$#" -ne 1 ]; then
-    echo "Illegal number of parameters"
-    echo "Usage: ./release.sh X.X.X"
-    exit 1
-fi
-
 for platform in "${platforms[@]}"
 do
    platform_split=(${platform//\// })
    GOOS=${platform_split[0]}
    GOARCH=${platform_split[1]}
-    output_name='papeer-v'$version'-'$GOOS'-'$GOARCH
+    output_name='papeer-'$version'-'$GOOS'-'$GOARCH
    if [ $GOOS = "windows" ]; then
        output_name+='.exe'
    fi
Author	SHA1	Message	Date
lapwat	2be32cd50f	fix: display images in epub	2021-12-09 23:48:25 +01:00
lapwat	4b9840e356	bump version in readme	2021-10-13 00:09:00 +02:00