diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..1574cc5 --- /dev/null +++ b/Makefile @@ -0,0 +1,10 @@ +install: + go install + +format: + gofmt -s -w . + +clean: + find . -maxdepth 1 -not -name 'README.md' -name '*.md' -delete + find . -maxdepth 1 -name '*.epub' -delete + find . -maxdepth 1 -name '*.mobi' -delete diff --git a/README.md b/README.md index 840f7ad..37fc3a5 100644 --- a/README.md +++ b/README.md @@ -19,14 +19,16 @@ Available Commands: version Print the version number of papeer Flags: + -a, --author string book author -d, --delay int time to wait before downloading next chapter, in milliseconds (default -1) -f, --format string file format [stdout, md, epub, mobi] (default "stdout") -h, --help help for papeer --images retrieve images only -i, --include include URL as first chapter, in resursive mode -l, --limit int limit number of chapters, in recursive mode (default -1) + -n, --name string book name (default: page title) -o, --offset int skip first chapters, in recursive mode - --output string output file + --output string file name (default: book name) -r, --recursive create one chapter per natigation item -s, --selector string table of content CSS selector, in resursive mode -t, --threads int download concurrency, in recursive mode (default -1) @@ -134,4 +136,4 @@ You can replace `bash` by your own shell (zsh, fish or powershell). 
- `html-to-markdown` convert HTML to Markdown - `go-epub` convert HTML to EPUB - `colly` query HTML trees -- `uiprogress` display progress bars \ No newline at end of file +- `uiprogress` display progress bars diff --git a/book/scraper.go b/book/scraper.go index 1931897..5956d1f 100644 --- a/book/scraper.go +++ b/book/scraper.go @@ -14,25 +14,34 @@ import ( colly "github.com/gocolly/colly/v2" ) -func NewBookFromURL(url, selector string, recursive, include, images bool, limit, offset, delay, threads int) book { +func NewBookFromURL(url, selector, name, author string, recursive, include bool, limit, offset, delay, threads int) book { + var chapters []chapter + var home chapter + if recursive { - chapters := tableOfContent(url, selector, limit, offset, delay, threads, include, images) - - b := New(chapters[0].Name(), chapters[0].Author()) - for _, c := range chapters { - b.AddChapter(c) - } - - return b + chapters, home = tableOfContent(url, selector, limit, offset, delay, threads, include) } else { - c := NewChapterFromURL(url, images) - b := New(c.Name(), c.Author()) - b.AddChapter(c) - return b + chapters = []chapter{NewChapterFromURL(url)} + home = chapters[0] } + + if len(name) == 0 { + name = home.Name() + } + + if len(author) == 0 { + author = home.Author() + } + + b := New(name, author) + for _, c := range chapters { + b.AddChapter(c) + } + + return b } -func NewChapterFromURL(url string, images bool) chapter { +func NewChapterFromURL(url string) chapter { article, err := readability.FromURL(url, 30*time.Second) if err != nil { log.Fatalf("failed to parse %s, %v\n", url, err) @@ -40,31 +49,31 @@ func NewChapterFromURL(url string, images bool) chapter { content := strings.ReplaceAll(article.Content, "\n", "") - if images { - // parse html content - doc, err := goquery.NewDocumentFromReader(strings.NewReader(content)) - if err != nil { - log.Fatal(err) - } + // if images { + // // parse html content + // doc, err := 
goquery.NewDocumentFromReader(strings.NewReader(content)) + // if err != nil { + // log.Fatal(err) + // } - // extract images only - content = "" - doc.Find("img").Each(func(i int, s *goquery.Selection) { - newContent, _ := goquery.OuterHtml(s) - content += newContent - }) - } + // // extract images only + // content = "" + // doc.Find("img").Each(func(i int, s *goquery.Selection) { + // newContent, _ := goquery.OuterHtml(s) + // content += newContent + // }) + // } return chapter{article.Title, article.Byline, content} } -func tableOfContent(url, selector string, limit, offset, delay, threads int, include, images bool) []chapter { +func tableOfContent(url, selector string, limit, offset, delay, threads int, include bool) ([]chapter, chapter) { base, err := urllib.Parse(url) if err != nil { log.Fatal(err) } - links, err := GetLinks(base, selector, limit, offset, include) + links, home, err := GetLinks(base, selector, limit, offset, include) if err != nil { log.Fatal(err) } @@ -82,7 +91,7 @@ func tableOfContent(url, selector string, limit, offset, delay, threads int, inc log.Fatal(err) } - chapters[index] = NewChapterFromURL(u.String(), images) + chapters[index] = NewChapterFromURL(u.String()) progress.Incr(index) // short sleep for last chapter to let the progress bar update @@ -116,7 +125,7 @@ func tableOfContent(url, selector string, limit, offset, delay, threads int, inc log.Fatal(err) } - chapters[index] = NewChapterFromURL(u.String(), images) + chapters[index] = NewChapterFromURL(u.String()) progress.Incr(index) <-semaphore @@ -124,7 +133,8 @@ func tableOfContent(url, selector string, limit, offset, delay, threads int, inc } wg.Wait() } - return chapters + + return chapters, home } func GetPath(elm *goquery.Selection) string { @@ -144,7 +154,7 @@ func GetPath(elm *goquery.Selection) string { return join } -func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool) ([]link, error) { +func GetLinks(url *urllib.URL, selector string, limit, 
offset int, include bool) ([]link, chapter, error) { selectorSet := true if selector == "" { selector = "a" @@ -182,7 +192,7 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool) links := pathLinks[pathMax] if len(links) == 0 { - return []link{}, fmt.Errorf("no link found for selector: %s", selector) + return []link{}, chapter{}, fmt.Errorf("no link found for selector: %s", selector) } end := len(links) @@ -192,11 +202,12 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool) links = links[offset:end] + home := NewChapterFromURL(url.String()) + if include { - c := NewChapterFromURL(url.String(), false) - l := NewLink(url.String(), c.Name()) + l := NewLink(url.String(), home.Name()) links = append([]link{l}, links...) } - return links, nil + return links, home, nil } diff --git a/cmd/get.go b/cmd/get.go index facee52..66cc70c 100644 --- a/cmd/get.go +++ b/cmd/get.go @@ -17,7 +17,7 @@ import ( ) var recursive, include, images bool -var format, output, selector string +var format, output, selector, name, author string var limit, offset, delay, threads int var getCmd = &cobra.Command{ @@ -77,7 +77,7 @@ var getCmd = &cobra.Command{ }, Run: func(cmd *cobra.Command, args []string) { url := args[0] - b := book.NewBookFromURL(url, selector, recursive, include, images, limit, offset, delay, threads) + b := book.NewBookFromURL(url, selector, name, author, recursive, include, limit, offset, delay, threads) if len(output) == 0 { // set default output @@ -136,27 +136,36 @@ var getCmd = &cobra.Command{ e.SetAuthor(b.Author()) for _, c := range b.Chapters() { - // parse content + var content string + + if images == false { + content = c.Content() + } + + // parse content doc, err := goquery.NewDocumentFromReader(strings.NewReader(c.Content())) if err != nil { log.Fatal(err) } // retrieve images and download it - contentWithLocalImages := c.Content() doc.Find("img").Each(func(i int, s *goquery.Selection) { src, _ := 
s.Attr("src") imagePath, _ := e.AddImage(src, "") - contentWithLocalImages = strings.ReplaceAll(contentWithLocalImages, src, imagePath) + if images { + imageTag, _ := goquery.OuterHtml(s) + content += imageTag + } + + content = strings.ReplaceAll(content, src, imagePath) }) - html := fmt.Sprintf("

%s

%s", c.Name(), contentWithLocalImages) + html := fmt.Sprintf("

%s

%s", c.Name(), content) _, err = e.AddSection(html, c.Name(), "", "") if err != nil { log.Fatal(err) } - } err := e.Write(output) @@ -171,8 +180,37 @@ var getCmd = &cobra.Command{ e := epub.NewEpub(b.Name()) e.SetAuthor(b.Author()) - for _, chapter := range b.Chapters() { - e.AddSection(chapter.Content(), chapter.Name(), "", "") + for _, c := range b.Chapters() { + var content string + + if images == false { + content = c.Content() + } + + // parse content + doc, err := goquery.NewDocumentFromReader(strings.NewReader(c.Content())) + if err != nil { + log.Fatal(err) + } + + // retrieve images and download it + doc.Find("img").Each(func(i int, s *goquery.Selection) { + src, _ := s.Attr("src") + imagePath, _ := e.AddImage(src, "") + + if images { + imageTag, _ := goquery.OuterHtml(s) + content += imageTag + } + + content = strings.ReplaceAll(content, src, imagePath) + }) + + html := fmt.Sprintf("

%s

%s", c.Name(), content) + _, err = e.AddSection(html, c.Name(), "", "") + if err != nil { + log.Fatal(err) + } } outputEPUB := strings.ReplaceAll(output, ".mobi", ".epub") @@ -183,16 +221,16 @@ var getCmd = &cobra.Command{ } exec.Command("kindlegen", outputEPUB).Run() - // exec command always return status 1 even if it fails + // exec command always returns status 1 even if it succeeds // if err != nil { // log.Fatal(err) // } fmt.Printf("Ebook saved to \"%s\"\n", output) - err2 := os.Remove(outputEPUB) - if err2 != nil { - log.Fatal(err2) + err = os.Remove(outputEPUB) + if err != nil { + log.Fatal(err) } } }, diff --git a/cmd/list.go b/cmd/list.go index 9baf4dd..4ab8c06 100644 --- a/cmd/list.go +++ b/cmd/list.go @@ -27,7 +27,7 @@ var listCmd = &cobra.Command{ log.Fatal(err) } - links, err := book.GetLinks(base, selector, limit, offset, include) + links, _, err := book.GetLinks(base, selector, limit, offset, include) if err != nil { log.Fatal(err) } diff --git a/cmd/root.go b/cmd/root.go index b92f910..970bcff 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -23,8 +23,10 @@ func Execute() { } func init() { + rootCmd.PersistentFlags().StringVarP(&name, "name", "n", "", "book name (default: page title)") + rootCmd.PersistentFlags().StringVarP(&author, "author", "a", "", "book author") rootCmd.PersistentFlags().StringVarP(&format, "format", "f", "stdout", "file format [stdout, md, epub, mobi]") - rootCmd.PersistentFlags().StringVarP(&output, "output", "", "", "output file") + rootCmd.PersistentFlags().StringVarP(&output, "output", "", "", "file name (default: book name)") rootCmd.PersistentFlags().StringVarP(&selector, "selector", "s", "", "table of content CSS selector, in resursive mode") rootCmd.PersistentFlags().BoolVarP(&recursive, "recursive", "r", false, "create one chapter per natigation item") rootCmd.PersistentFlags().BoolVarP(&include, "include", "i", false, "include URL as first chapter, in resursive mode") diff --git a/cmd/version.go b/cmd/version.go index
bbbd88e..359b04f 100644 --- a/cmd/version.go +++ b/cmd/version.go @@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{ Use: "version", Short: "Print the version number of papeer", Run: func(cmd *cobra.Command, args []string) { - fmt.Println("papeer v0.3.0") + fmt.Println("papeer v0.3.1") }, }