diff --git a/README.md b/README.md index 5a0d812..c7db758 100644 --- a/README.md +++ b/README.md @@ -111,14 +111,15 @@ go get -u github.com/lapwat/papeer ```sh platform=linux # use platform=darwin for MacOS -curl -L https://github.com/lapwat/papeer/releases/download/v0.3.1/papeer-v0.3.1-$platform-amd64 > papeer +release=0.3.2 +curl -L https://github.com/lapwat/papeer/releases/download/v$release/papeer-v$release-$platform-amd64 > papeer chmod +x papeer sudo mv papeer /usr/local/bin ``` ### On Windows -Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.3.1/papeer-v0.3.1-windows-amd64.exe). +Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.3.2/papeer-v0.3.2-windows-amd64.exe). ## Install kindlegen to export websites to MOBI (optional) diff --git a/book/format.go b/book/format.go index 7d25094..6a7d576 100644 --- a/book/format.go +++ b/book/format.go @@ -22,89 +22,100 @@ func Filename(name string) string { } func ToMarkdown(c chapter) string { + markdown := "" - // make title - underline := strings.Repeat("=", len(c.Name())) - title := fmt.Sprintf("%s\n%s", c.Name(), underline) + if c.config.include { + // title + markdown += fmt.Sprintf("%s\n", c.Name()) + markdown += fmt.Sprintf("%s\n\n", strings.Repeat("=", len(c.Name()))) - // convert content to markdown - content, err := md.NewConverter("", true, nil).ConvertString(c.Content()) - if err != nil { - log.Fatal(err) + // convert content to markdown + content, err := md.NewConverter("", true, nil).ConvertString(c.Content()) + if err != nil { + log.Fatal(err) + } + markdown += fmt.Sprintf("%s\n\n\n", content) } - // merge title and content - content = fmt.Sprintf("%s\n\n%s", title, content) - for _, sc := range c.SubChapters() { - // merge subchapters - content = fmt.Sprintf("%s\n\n\n%s", content, ToMarkdown(sc)) + // subchapters content + markdown += fmt.Sprintf("%s\n\n\n", ToMarkdown(sc)) } - return content + return markdown } func ToEpub(c chapter, filename string) string { if len(filename) == 0 { - filename = fmt.Sprintf("%s.epub", c.Name()) + filename = fmt.Sprintf("%s.epub", Filename(c.Name())) } // init ebook e := epub.NewEpub(c.Name()) e.SetAuthor(c.Author()) - AppendToEpub(e, c, false) + AppendToEpub(e, c) err := e.Write(filename) if err != nil { log.Fatal(err) } - fmt.Printf("Ebook saved to \"%s\"\n", filename) - return filename } -func AppendToEpub(e *epub.Epub, c chapter, imagesOnly bool) { +func AppendToEpub(e *epub.Epub, c chapter) { content := "" - if imagesOnly == false { - content = c.Content() - } + if c.config.include { - // parse content - doc, err := goquery.NewDocumentFromReader(strings.NewReader(c.Content())) - if err != nil { - log.Fatal(err) - } - - // download images and replace src in img tags of content - doc.Find("img").Each(func(i int, s *goquery.Selection) { - src, _ := s.Attr("src") - imagePath, _ := e.AddImage(src, "") - - if imagesOnly { - imageTag, _ := goquery.OuterHtml(s) - content += strings.Replace(imageTag, src, imagePath, 1) - } else { - content = strings.Replace(content, src, imagePath, 1) + if c.config.imagesOnly == false { + content = c.Content() + } + + // parse content + doc, err := goquery.NewDocumentFromReader(strings.NewReader(c.Content())) + if err != nil { + log.Fatal(err) + } + + // download images and replace src in img tags of content + doc.Find("img").Each(func(i int, s *goquery.Selection) { + src, _ := s.Attr("src") + src = strings.Split(src, "?")[0] // remove query part + imagePath, _ := e.AddImage(src, "") + + if c.config.imagesOnly { + imageTag, _ := goquery.OuterHtml(s) + content += strings.Replace(imageTag, src, imagePath, 1) + } else { + content = strings.Replace(content, src, imagePath, 1) + } + }) + + html := "" + // add title only if imagesOnly = false + if c.config.imagesOnly == false { + html += fmt.Sprintf("

%s

", c.Name()) + } + html += content + + // write to epub file + _, err = e.AddSection(html, c.Name(), "", "") + if err != nil { + log.Fatal(err) } - }) - html := fmt.Sprintf("

%s

%s", c.Name(), content) - _, err = e.AddSection(html, c.Name(), "", "") - if err != nil { - log.Fatal(err) } for _, sc := range c.SubChapters() { - AppendToEpub(e, sc, false) + AppendToEpub(e, sc) } } func ToMobi(c chapter, filename string) string { if len(filename) == 0 { - filename = fmt.Sprintf("%s.mobi", c.Name()) + filename = fmt.Sprintf("%s.mobi", Filename(c.Name())) } else { // add .mobi extension if not specified @@ -123,12 +134,10 @@ func ToMobi(c chapter, filename string) string { // log.Fatal(err) // } - fmt.Printf("Ebook saved to \"%s\"\n", filename) - err := os.Remove(filenameEPUB) if err != nil { log.Fatal(err) } - + return filename } diff --git a/book/format_test.go b/book/format_test.go index 8769bd5..1bb4b85 100644 --- a/book/format_test.go +++ b/book/format_test.go @@ -22,12 +22,12 @@ func TestToMarkdown(t *testing.T) { c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {}) got := ToMarkdown(c) - want := "Books\n=====\n\n- [Discours de la Méthode](https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/)clock 98 min read -\n1637\n\n- [The Twelve-Factor App](https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/)clock 22 min read -\n2011" + want := "Books\n=====\n\n- [Discours de la Méthode](https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/)clock 98 min read -\n1637\n\n- [The Twelve-Factor App](https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/)clock 22 min read -\n2011\n\n\n" if got != want { t.Errorf("got %q, wanted %q", got, want) } - + } func TestToEpub(t *testing.T) { @@ -43,7 +43,7 @@ func TestToEpub(t *testing.T) { t.Errorf("cannot remove %v: %v", filename, err) } } - + } func TestToEpubFilename(t *testing.T) { @@ -59,7 +59,7 @@ func TestToEpubFilename(t *testing.T) { t.Errorf("cannot remove %v: %v", filename, err) } } - + } func TestToMobi(t *testing.T) { @@ -75,5 +75,5 @@ func TestToMobi(t *testing.T) { t.Errorf("cannot remove %v: %v", filename, err) } } - + } diff --git a/book/scraper.go b/book/scraper.go index 60fb8c7..753c1ff 100644 --- a/book/scraper.go +++ b/book/scraper.go @@ -69,6 +69,13 @@ func NewScrapeConfigsWikipedia() []*ScrapeConfig { return []*ScrapeConfig{config0, config1} } +func NewScrapeConfigFake() *ScrapeConfig { + config := NewScrapeConfig() + config.include = false + + return config +} + func NewBookFromURL(url, selector, name, author string, recursive, include, imagesOnly bool, limit, offset, delay, threads int) book { config1 := NewScrapeConfig() config1.imagesOnly = imagesOnly @@ -85,7 +92,7 @@ func NewBookFromURL(url, selector, name, author string, recursive, include, imag config2.threads = threads config2.include = include config2.imagesOnly = imagesOnly - chapters, home = tableOfContent(url, config2) + chapters, home = tableOfContent(url, config2, config1) } else { chapters = []chapter{NewChapterFromURL(url, []*ScrapeConfig{config1}, 0, func(index int, name string) {})} home = chapters[0] @@ -136,7 +143,7 @@ func NewChapterFromURL(url string, configs []*ScrapeConfig, index int, updatePro } name := article.Title - // notify progress bar + // notify progress bar with new name updateProgressBarName(index, name) subchapters := []chapter{} @@ -222,6 +229,8 @@ func NewChapterFromURL(url string, configs []*ScrapeConfig, index int, updatePro content = "" doc.Find("img").Each(func(i int, s *goquery.Selection) { imageTag, _ := goquery.OuterHtml(s) + imageTag = strings.ReplaceAll(imageTag, "\n", "") + content += imageTag }) @@ -231,7 +240,7 @@ func NewChapterFromURL(url string, configs []*ScrapeConfig, index int, updatePro return chapter{string(body), name, article.Byline, content, subchapters, config} } -func tableOfContent(url string, config *ScrapeConfig) ([]chapter, chapter) { +func tableOfContent(url string, config *ScrapeConfig, subConfig *ScrapeConfig) ([]chapter, chapter) { base, err := urllib.Parse(url) if err != nil { log.Fatal(err) @@ -243,7 +252,7 @@ func tableOfContent(url string, config *ScrapeConfig) ([]chapter, chapter) { } chapters := make([]chapter, len(links)) - progress := NewProgress(links, "", 0) + // progress := NewProgress(links, "", 0) delay := config.delay if delay >= 0 { @@ -256,9 +265,9 @@ func tableOfContent(url string, config *ScrapeConfig) ([]chapter, chapter) { log.Fatal(err) } - sc := NewChapterFromURL(u.String(), []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {}) + sc := NewChapterFromURL(u.String(), []*ScrapeConfig{subConfig}, 0, func(index int, name string) {}) chapters[index] = sc - progress.Increment(index) + // progress.Increment(index) // short sleep for last chapter to let the progress bar update if index == len(links)-1 { @@ -292,9 +301,9 @@ func tableOfContent(url string, config *ScrapeConfig) ([]chapter, chapter) { log.Fatal(err) } - sc := NewChapterFromURL(u.String(), []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {}) + sc := NewChapterFromURL(u.String(), []*ScrapeConfig{subConfig}, 0, func(index int, name string) {}) chapters[index] = sc - progress.Increment(index) + // progress.Increment(index) <-semaphore }(index, l) diff --git a/book/scraper_test.go b/book/scraper_test.go index 90adf5a..46e97bd 100644 --- a/book/scraper_test.go +++ b/book/scraper_test.go @@ -153,7 +153,7 @@ func TestSubChaptersLimitOver(t *testing.T) { config0 := NewScrapeConfig() config0.limit = 3 - + config1 := NewScrapeConfig() c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {}) diff --git a/cmd/get.go b/cmd/get.go index 51630d7..df1c99a 100644 --- a/cmd/get.go +++ b/cmd/get.go @@ -5,18 +5,14 @@ import ( "fmt" "log" "os" - "os/exec" "strings" - md "github.com/JohannesKaufmann/html-to-markdown" - "github.com/PuerkitoBio/goquery" - epub "github.com/bmaupin/go-epub" "github.com/spf13/cobra" "github.com/lapwat/papeer/book" ) -var recursive, include, images bool +var recursive, include, images, quiet bool var format, output, selector, name, author string var limit, offset, delay, threads int @@ -79,159 +75,46 @@ var getCmd = &cobra.Command{ url := args[0] b := book.NewBookFromURL(url, selector, name, author, recursive, include, images, limit, offset, delay, threads) - if len(output) == 0 { - // set default output - output = strings.ReplaceAll(b.Name(), " ", "_") - output = strings.ReplaceAll(output, "/", "") - output = fmt.Sprintf("%s.%s", output, format) - } + fakeConfig := book.NewScrapeConfigFake() + fakeChapter := book.NewChapter("", b.Name(), b.Author(), "", b.Chapters(), fakeConfig) if format == "stdout" { - - for _, c := range b.Chapters() { - // convert to markdown - content, err := md.NewConverter("", true, nil).ConvertString(c.Content()) - if err != nil { - log.Fatal(err) - } - - text := fmt.Sprintf("%s\n%s\n\n%s\n\n\n", c.Name(), strings.Repeat("=", len(c.Name())), content) - - // write to stdout - fmt.Println(text) - } - + // TODO: ToMarkdownString + markdown := book.ToMarkdown(fakeChapter) + fmt.Println(markdown) } if format == "md" { + // TODO: ToMarkdownFile + markdown := book.ToMarkdown(fakeChapter) - // create markdown file + if len(output) == 0 { + filename := book.Filename(fakeChapter.Name()) + output = fmt.Sprintf("%s.md", filename) + } + + // write to file f, err := os.Create(output) if err != nil { log.Fatal(err) } - defer f.Close() - - for _, c := range b.Chapters() { - // convert to markdown - content, err := md.NewConverter("", true, nil).ConvertString(c.Content()) - if err != nil { - log.Fatal(err) - } - - text := fmt.Sprintf("%s\n%s\n\n%s\n\n\n", c.Name(), strings.Repeat("=", len(c.Name())), content) - - // write to markdown file - _, err = f.WriteString(text) - if err != nil { - log.Fatal(err) - } + _, err2 := f.WriteString(markdown) + if err2 != nil { + log.Fatal(err2) } + f.Close() fmt.Printf("Markdown saved to \"%s\"\n", output) } if format == "epub" { - e := epub.NewEpub(b.Name()) - e.SetAuthor(b.Author()) - - for _, c := range b.Chapters() { - var content string - - if images == false { - content = c.Content() - } - - // parse content - doc, err := goquery.NewDocumentFromReader(strings.NewReader(c.Content())) - if err != nil { - log.Fatal(err) - } - - // retrieve images and download it - doc.Find("img").Each(func(i int, s *goquery.Selection) { - src, _ := s.Attr("src") - imagePath, _ := e.AddImage(src, "") - - if images { - imageTag, _ := goquery.OuterHtml(s) - content += imageTag - } - - content = strings.ReplaceAll(content, src, imagePath) - }) - - html := fmt.Sprintf("

%s

%s", c.Name(), content) - _, err = e.AddSection(html, c.Name(), "", "") - if err != nil { - log.Fatal(err) - } - } - - err := e.Write(output) - if err != nil { - log.Fatal(err) - } - + output = book.ToEpub(fakeChapter, output) fmt.Printf("Ebook saved to \"%s\"\n", output) } if format == "mobi" { - e := epub.NewEpub(b.Name()) - e.SetAuthor(b.Author()) - - for _, c := range b.Chapters() { - var content string - - if images == false { - content = c.Content() - } - - // parse content - doc, err := goquery.NewDocumentFromReader(strings.NewReader(c.Content())) - if err != nil { - log.Fatal(err) - } - - // retrieve images and download it - doc.Find("img").Each(func(i int, s *goquery.Selection) { - src, _ := s.Attr("src") - imagePath, _ := e.AddImage(src, "") - - if images { - imageTag, _ := goquery.OuterHtml(s) - content += imageTag - } - - content = strings.ReplaceAll(content, src, imagePath) - }) - - html := fmt.Sprintf("

%s

%s", c.Name(), content) - _, err = e.AddSection(html, c.Name(), "", "") - if err != nil { - log.Fatal(err) - } - } - - outputEPUB := strings.ReplaceAll(output, ".mobi", ".epub") - - err := e.Write(outputEPUB) - if err != nil { - log.Fatal(err) - } - - exec.Command("kindlegen", outputEPUB).Run() - // exec command always return status 1 even if it succeed - // if err != nil { - // log.Fatal(err) - // } - + output = book.ToMobi(fakeChapter, output) fmt.Printf("Ebook saved to \"%s\"\n", output) - - err = os.Remove(outputEPUB) - if err != nil { - log.Fatal(err) - } } }, } diff --git a/cmd/version.go b/cmd/version.go index 359b04f..d1b85a6 100644 --- a/cmd/version.go +++ b/cmd/version.go @@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{ Use: "version", Short: "Print the version number of papeer", Run: func(cmd *cobra.Command, args []string) { - fmt.Println("papeer v0.3.1") + fmt.Println("papeer v0.3.2") }, }