diff --git a/README.md b/README.md
index 5a0d812..c7db758 100644
--- a/README.md
+++ b/README.md
@@ -111,14 +111,15 @@ go get -u github.com/lapwat/papeer
```sh
platform=linux # use platform=darwin for MacOS
-curl -L https://github.com/lapwat/papeer/releases/download/v0.3.1/papeer-v0.3.1-$platform-amd64 > papeer
+release=0.3.2
+curl -L https://github.com/lapwat/papeer/releases/download/v$release/papeer-v$release-$platform-amd64 > papeer
chmod +x papeer
sudo mv papeer /usr/local/bin
```
### On Windows
-Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.3.1/papeer-v0.3.1-windows-amd64.exe).
+Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.3.2/papeer-v0.3.2-windows-amd64.exe).
## Install kindlegen to export websites to MOBI (optional)
diff --git a/book/format.go b/book/format.go
index 7d25094..6a7d576 100644
--- a/book/format.go
+++ b/book/format.go
@@ -22,89 +22,100 @@ func Filename(name string) string {
}
func ToMarkdown(c chapter) string {
+ markdown := ""
- // make title
- underline := strings.Repeat("=", len(c.Name()))
- title := fmt.Sprintf("%s\n%s", c.Name(), underline)
+ if c.config.include {
+ // title
+ markdown += fmt.Sprintf("%s\n", c.Name())
+ markdown += fmt.Sprintf("%s\n\n", strings.Repeat("=", len(c.Name())))
- // convert content to markdown
- content, err := md.NewConverter("", true, nil).ConvertString(c.Content())
- if err != nil {
- log.Fatal(err)
+ // convert content to markdown
+ content, err := md.NewConverter("", true, nil).ConvertString(c.Content())
+ if err != nil {
+ log.Fatal(err)
+ }
+ markdown += fmt.Sprintf("%s\n\n\n", content)
}
- // merge title and content
- content = fmt.Sprintf("%s\n\n%s", title, content)
-
for _, sc := range c.SubChapters() {
- // merge subchapters
- content = fmt.Sprintf("%s\n\n\n%s", content, ToMarkdown(sc))
+ // subchapters content
+ markdown += fmt.Sprintf("%s\n\n\n", ToMarkdown(sc))
}
- return content
+ return markdown
}
func ToEpub(c chapter, filename string) string {
if len(filename) == 0 {
- filename = fmt.Sprintf("%s.epub", c.Name())
+ filename = fmt.Sprintf("%s.epub", Filename(c.Name()))
}
// init ebook
e := epub.NewEpub(c.Name())
e.SetAuthor(c.Author())
- AppendToEpub(e, c, false)
+ AppendToEpub(e, c)
err := e.Write(filename)
if err != nil {
log.Fatal(err)
}
- fmt.Printf("Ebook saved to \"%s\"\n", filename)
-
return filename
}
-func AppendToEpub(e *epub.Epub, c chapter, imagesOnly bool) {
+func AppendToEpub(e *epub.Epub, c chapter) {
content := ""
- if imagesOnly == false {
- content = c.Content()
- }
+ if c.config.include {
- // parse content
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(c.Content()))
- if err != nil {
- log.Fatal(err)
- }
-
- // download images and replace src in img tags of content
- doc.Find("img").Each(func(i int, s *goquery.Selection) {
- src, _ := s.Attr("src")
- imagePath, _ := e.AddImage(src, "")
-
- if imagesOnly {
- imageTag, _ := goquery.OuterHtml(s)
- content += strings.Replace(imageTag, src, imagePath, 1)
- } else {
- content = strings.Replace(content, src, imagePath, 1)
+ if c.config.imagesOnly == false {
+ content = c.Content()
+ }
+
+ // parse content
+ doc, err := goquery.NewDocumentFromReader(strings.NewReader(c.Content()))
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ // download images and replace src in img tags of content
+ doc.Find("img").Each(func(i int, s *goquery.Selection) {
+ src, _ := s.Attr("src")
+ src = strings.Split(src, "?")[0] // remove query part
+ imagePath, _ := e.AddImage(src, "")
+
+ if c.config.imagesOnly {
+ imageTag, _ := goquery.OuterHtml(s)
+ content += strings.Replace(imageTag, src, imagePath, 1)
+ } else {
+ content = strings.Replace(content, src, imagePath, 1)
+ }
+ })
+
+ html := ""
+ // add title only if imagesOnly = false
+ if c.config.imagesOnly == false {
+ html += fmt.Sprintf("
%s
", c.Name())
+ }
+ html += content
+
+ // write to epub file
+ _, err = e.AddSection(html, c.Name(), "", "")
+ if err != nil {
+ log.Fatal(err)
}
- })
- html := fmt.Sprintf("%s
%s", c.Name(), content)
- _, err = e.AddSection(html, c.Name(), "", "")
- if err != nil {
- log.Fatal(err)
}
for _, sc := range c.SubChapters() {
- AppendToEpub(e, sc, false)
+ AppendToEpub(e, sc)
}
}
func ToMobi(c chapter, filename string) string {
if len(filename) == 0 {
- filename = fmt.Sprintf("%s.mobi", c.Name())
+ filename = fmt.Sprintf("%s.mobi", Filename(c.Name()))
} else {
// add .mobi extension if not specified
@@ -123,12 +134,10 @@ func ToMobi(c chapter, filename string) string {
// log.Fatal(err)
// }
- fmt.Printf("Ebook saved to \"%s\"\n", filename)
-
err := os.Remove(filenameEPUB)
if err != nil {
log.Fatal(err)
}
-
+
return filename
}
diff --git a/book/format_test.go b/book/format_test.go
index 8769bd5..1bb4b85 100644
--- a/book/format_test.go
+++ b/book/format_test.go
@@ -22,12 +22,12 @@ func TestToMarkdown(t *testing.T) {
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
got := ToMarkdown(c)
- want := "Books\n=====\n\n- [Discours de la Méthode](https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/)clock 98 min read -\n1637\n\n- [The Twelve-Factor App](https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/)clock 22 min read -\n2011"
+ want := "Books\n=====\n\n- [Discours de la Méthode](https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/)clock 98 min read -\n1637\n\n- [The Twelve-Factor App](https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/)clock 22 min read -\n2011\n\n\n"
if got != want {
t.Errorf("got %q, wanted %q", got, want)
}
-
+
}
func TestToEpub(t *testing.T) {
@@ -43,7 +43,7 @@ func TestToEpub(t *testing.T) {
t.Errorf("cannot remove %v: %v", filename, err)
}
}
-
+
}
func TestToEpubFilename(t *testing.T) {
@@ -59,7 +59,7 @@ func TestToEpubFilename(t *testing.T) {
t.Errorf("cannot remove %v: %v", filename, err)
}
}
-
+
}
func TestToMobi(t *testing.T) {
@@ -75,5 +75,5 @@ func TestToMobi(t *testing.T) {
t.Errorf("cannot remove %v: %v", filename, err)
}
}
-
+
}
diff --git a/book/scraper.go b/book/scraper.go
index 60fb8c7..753c1ff 100644
--- a/book/scraper.go
+++ b/book/scraper.go
@@ -69,6 +69,13 @@ func NewScrapeConfigsWikipedia() []*ScrapeConfig {
return []*ScrapeConfig{config0, config1}
}
+func NewScrapeConfigFake() *ScrapeConfig {
+ config := NewScrapeConfig()
+ config.include = false
+
+ return config
+}
+
func NewBookFromURL(url, selector, name, author string, recursive, include, imagesOnly bool, limit, offset, delay, threads int) book {
config1 := NewScrapeConfig()
config1.imagesOnly = imagesOnly
@@ -85,7 +92,7 @@ func NewBookFromURL(url, selector, name, author string, recursive, include, imag
config2.threads = threads
config2.include = include
config2.imagesOnly = imagesOnly
- chapters, home = tableOfContent(url, config2)
+ chapters, home = tableOfContent(url, config2, config1)
} else {
chapters = []chapter{NewChapterFromURL(url, []*ScrapeConfig{config1}, 0, func(index int, name string) {})}
home = chapters[0]
@@ -136,7 +143,7 @@ func NewChapterFromURL(url string, configs []*ScrapeConfig, index int, updatePro
}
name := article.Title
- // notify progress bar
+ // notify progress bar with new name
updateProgressBarName(index, name)
subchapters := []chapter{}
@@ -222,6 +229,8 @@ func NewChapterFromURL(url string, configs []*ScrapeConfig, index int, updatePro
content = ""
doc.Find("img").Each(func(i int, s *goquery.Selection) {
imageTag, _ := goquery.OuterHtml(s)
+ imageTag = strings.ReplaceAll(imageTag, "\n", "")
+
content += imageTag
})
@@ -231,7 +240,7 @@ func NewChapterFromURL(url string, configs []*ScrapeConfig, index int, updatePro
return chapter{string(body), name, article.Byline, content, subchapters, config}
}
-func tableOfContent(url string, config *ScrapeConfig) ([]chapter, chapter) {
+func tableOfContent(url string, config *ScrapeConfig, subConfig *ScrapeConfig) ([]chapter, chapter) {
base, err := urllib.Parse(url)
if err != nil {
log.Fatal(err)
@@ -243,7 +252,7 @@ func tableOfContent(url string, config *ScrapeConfig) ([]chapter, chapter) {
}
chapters := make([]chapter, len(links))
- progress := NewProgress(links, "", 0)
+ // progress := NewProgress(links, "", 0)
delay := config.delay
if delay >= 0 {
@@ -256,9 +265,9 @@ func tableOfContent(url string, config *ScrapeConfig) ([]chapter, chapter) {
log.Fatal(err)
}
- sc := NewChapterFromURL(u.String(), []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
+ sc := NewChapterFromURL(u.String(), []*ScrapeConfig{subConfig}, 0, func(index int, name string) {})
chapters[index] = sc
- progress.Increment(index)
+ // progress.Increment(index)
// short sleep for last chapter to let the progress bar update
if index == len(links)-1 {
@@ -292,9 +301,9 @@ func tableOfContent(url string, config *ScrapeConfig) ([]chapter, chapter) {
log.Fatal(err)
}
- sc := NewChapterFromURL(u.String(), []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
+ sc := NewChapterFromURL(u.String(), []*ScrapeConfig{subConfig}, 0, func(index int, name string) {})
chapters[index] = sc
- progress.Increment(index)
+ // progress.Increment(index)
<-semaphore
}(index, l)
diff --git a/book/scraper_test.go b/book/scraper_test.go
index 90adf5a..46e97bd 100644
--- a/book/scraper_test.go
+++ b/book/scraper_test.go
@@ -153,7 +153,7 @@ func TestSubChaptersLimitOver(t *testing.T) {
config0 := NewScrapeConfig()
config0.limit = 3
-
+
config1 := NewScrapeConfig()
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {})
diff --git a/cmd/get.go b/cmd/get.go
index 51630d7..df1c99a 100644
--- a/cmd/get.go
+++ b/cmd/get.go
@@ -5,18 +5,14 @@ import (
"fmt"
"log"
"os"
- "os/exec"
"strings"
- md "github.com/JohannesKaufmann/html-to-markdown"
- "github.com/PuerkitoBio/goquery"
- epub "github.com/bmaupin/go-epub"
"github.com/spf13/cobra"
"github.com/lapwat/papeer/book"
)
-var recursive, include, images bool
+var recursive, include, images, quiet bool
var format, output, selector, name, author string
var limit, offset, delay, threads int
@@ -79,159 +75,46 @@ var getCmd = &cobra.Command{
url := args[0]
b := book.NewBookFromURL(url, selector, name, author, recursive, include, images, limit, offset, delay, threads)
- if len(output) == 0 {
- // set default output
- output = strings.ReplaceAll(b.Name(), " ", "_")
- output = strings.ReplaceAll(output, "/", "")
- output = fmt.Sprintf("%s.%s", output, format)
- }
+ fakeConfig := book.NewScrapeConfigFake()
+ fakeChapter := book.NewChapter("", b.Name(), b.Author(), "", b.Chapters(), fakeConfig)
if format == "stdout" {
-
- for _, c := range b.Chapters() {
- // convert to markdown
- content, err := md.NewConverter("", true, nil).ConvertString(c.Content())
- if err != nil {
- log.Fatal(err)
- }
-
- text := fmt.Sprintf("%s\n%s\n\n%s\n\n\n", c.Name(), strings.Repeat("=", len(c.Name())), content)
-
- // write to stdout
- fmt.Println(text)
- }
-
+ // TODO: ToMarkdownString
+ markdown := book.ToMarkdown(fakeChapter)
+ fmt.Println(markdown)
}
if format == "md" {
+ // TODO: ToMarkdownFile
+ markdown := book.ToMarkdown(fakeChapter)
- // create markdown file
+ if len(output) == 0 {
+ filename := book.Filename(fakeChapter.Name())
+ output = fmt.Sprintf("%s.md", filename)
+ }
+
+ // write to file
f, err := os.Create(output)
if err != nil {
log.Fatal(err)
}
- defer f.Close()
-
- for _, c := range b.Chapters() {
- // convert to markdown
- content, err := md.NewConverter("", true, nil).ConvertString(c.Content())
- if err != nil {
- log.Fatal(err)
- }
-
- text := fmt.Sprintf("%s\n%s\n\n%s\n\n\n", c.Name(), strings.Repeat("=", len(c.Name())), content)
-
- // write to markdown file
- _, err = f.WriteString(text)
- if err != nil {
- log.Fatal(err)
- }
+ _, err2 := f.WriteString(markdown)
+ if err2 != nil {
+ log.Fatal(err2)
}
+ f.Close()
fmt.Printf("Markdown saved to \"%s\"\n", output)
}
if format == "epub" {
- e := epub.NewEpub(b.Name())
- e.SetAuthor(b.Author())
-
- for _, c := range b.Chapters() {
- var content string
-
- if images == false {
- content = c.Content()
- }
-
- // parse content
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(c.Content()))
- if err != nil {
- log.Fatal(err)
- }
-
- // retrieve images and download it
- doc.Find("img").Each(func(i int, s *goquery.Selection) {
- src, _ := s.Attr("src")
- imagePath, _ := e.AddImage(src, "")
-
- if images {
- imageTag, _ := goquery.OuterHtml(s)
- content += imageTag
- }
-
- content = strings.ReplaceAll(content, src, imagePath)
- })
-
- html := fmt.Sprintf("%s
%s", c.Name(), content)
- _, err = e.AddSection(html, c.Name(), "", "")
- if err != nil {
- log.Fatal(err)
- }
- }
-
- err := e.Write(output)
- if err != nil {
- log.Fatal(err)
- }
-
+ output = book.ToEpub(fakeChapter, output)
fmt.Printf("Ebook saved to \"%s\"\n", output)
}
if format == "mobi" {
- e := epub.NewEpub(b.Name())
- e.SetAuthor(b.Author())
-
- for _, c := range b.Chapters() {
- var content string
-
- if images == false {
- content = c.Content()
- }
-
- // parse content
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(c.Content()))
- if err != nil {
- log.Fatal(err)
- }
-
- // retrieve images and download it
- doc.Find("img").Each(func(i int, s *goquery.Selection) {
- src, _ := s.Attr("src")
- imagePath, _ := e.AddImage(src, "")
-
- if images {
- imageTag, _ := goquery.OuterHtml(s)
- content += imageTag
- }
-
- content = strings.ReplaceAll(content, src, imagePath)
- })
-
- html := fmt.Sprintf("%s
%s", c.Name(), content)
- _, err = e.AddSection(html, c.Name(), "", "")
- if err != nil {
- log.Fatal(err)
- }
- }
-
- outputEPUB := strings.ReplaceAll(output, ".mobi", ".epub")
-
- err := e.Write(outputEPUB)
- if err != nil {
- log.Fatal(err)
- }
-
- exec.Command("kindlegen", outputEPUB).Run()
- // exec command always return status 1 even if it succeed
- // if err != nil {
- // log.Fatal(err)
- // }
-
+ output = book.ToMobi(fakeChapter, output)
fmt.Printf("Ebook saved to \"%s\"\n", output)
-
- err = os.Remove(outputEPUB)
- if err != nil {
- log.Fatal(err)
- }
}
},
}
diff --git a/cmd/version.go b/cmd/version.go
index 359b04f..d1b85a6 100644
--- a/cmd/version.go
+++ b/cmd/version.go
@@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{
Use: "version",
Short: "Print the version number of papeer",
Run: func(cmd *cobra.Command, args []string) {
- fmt.Println("papeer v0.3.1")
+ fmt.Println("papeer v0.3.2")
},
}