mirror of
https://github.com/NohamR/papeer.git
synced 2026-05-25 20:00:47 +00:00
Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0009435769 | ||
|
|
4e9b0611e8 | ||
|
|
e7ffd8c66c | ||
|
|
84e6ad8585 | ||
|
|
d593a74e6e |
10
Makefile
Normal file
10
Makefile
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
install:
|
||||||
|
go install
|
||||||
|
|
||||||
|
format:
|
||||||
|
gofmt -s -w .
|
||||||
|
|
||||||
|
clean:
|
||||||
|
find . -maxdepth 1 -not -name 'README.md' -name '*.md' -delete
|
||||||
|
find . -maxdepth 1 -name '*.epub' -delete
|
||||||
|
find . -maxdepth 1 -name '*.mobi' -delete
|
||||||
10
README.md
10
README.md
@@ -19,14 +19,16 @@ Available Commands:
|
|||||||
version Print the version number of papeer
|
version Print the version number of papeer
|
||||||
|
|
||||||
Flags:
|
Flags:
|
||||||
|
-a, --author string book author
|
||||||
-d, --delay int time to wait before downloading next chapter, in milliseconds (default -1)
|
-d, --delay int time to wait before downloading next chapter, in milliseconds (default -1)
|
||||||
-f, --format string file format [stdout, md, epub, mobi] (default "stdout")
|
-f, --format string file format [stdout, md, epub, mobi] (default "stdout")
|
||||||
-h, --help help for papeer
|
-h, --help help for papeer
|
||||||
--images retrieve images only
|
--images retrieve images only
|
||||||
-i, --include include URL as first chapter, in recursive mode
|
-i, --include include URL as first chapter, in recursive mode
|
||||||
-l, --limit int limit number of chapters, in recursive mode (default -1)
|
-l, --limit int limit number of chapters, in recursive mode (default -1)
|
||||||
|
-n, --name string book name (default: page title)
|
||||||
-o, --offset int skip first chapters, in recursive mode
|
-o, --offset int skip first chapters, in recursive mode
|
||||||
--output string output file
|
--output string file name (default: book name)
|
||||||
-r, --recursive create one chapter per navigation item
|
-r, --recursive create one chapter per navigation item
|
||||||
-s, --selector string table of content CSS selector, in recursive mode
|
-s, --selector string table of content CSS selector, in recursive mode
|
||||||
-t, --threads int download concurrency, in recursive mode (default -1)
|
-t, --threads int download concurrency, in recursive mode (default -1)
|
||||||
@@ -95,14 +97,14 @@ go get -u github.com/lapwat/papeer
|
|||||||
|
|
||||||
```sh
|
```sh
|
||||||
platform=linux # use platform=darwin for MacOS
|
platform=linux # use platform=darwin for MacOS
|
||||||
curl -L https://github.com/lapwat/papeer/releases/download/v0.3.0/papeer-v0.3.0-$platform-amd64 > papeer
|
curl -L https://github.com/lapwat/papeer/releases/download/v0.3.1/papeer-v0.3.1-$platform-amd64 > papeer
|
||||||
chmod +x papeer
|
chmod +x papeer
|
||||||
sudo mv papeer /usr/local/bin
|
sudo mv papeer /usr/local/bin
|
||||||
```
|
```
|
||||||
|
|
||||||
### On Windows
|
### On Windows
|
||||||
|
|
||||||
Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.3.0/papeer-v0.3.0-windows-amd64.exe).
|
Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.3.1/papeer-v0.3.1-windows-amd64.exe).
|
||||||
|
|
||||||
## Install kindlegen to export websites to MOBI (optional)
|
## Install kindlegen to export websites to MOBI (optional)
|
||||||
|
|
||||||
@@ -134,4 +136,4 @@ You can replace `bash` by your own shell (zsh, fish or powershell).
|
|||||||
- `html-to-markdown` convert HTML to Markdown
|
- `html-to-markdown` convert HTML to Markdown
|
||||||
- `go-epub` convert HTML to EPUB
|
- `go-epub` convert HTML to EPUB
|
||||||
- `colly` query HTML trees
|
- `colly` query HTML trees
|
||||||
- `uiprogress` display progress bars
|
- `uiprogress` display progress bars
|
||||||
|
|||||||
@@ -14,25 +14,34 @@ import (
|
|||||||
colly "github.com/gocolly/colly/v2"
|
colly "github.com/gocolly/colly/v2"
|
||||||
)
|
)
|
||||||
|
|
||||||
func NewBookFromURL(url, selector string, recursive, include, images bool, limit, offset, delay, threads int) book {
|
func NewBookFromURL(url, selector, name, author string, recursive, include bool, limit, offset, delay, threads int) book {
|
||||||
|
var chapters []chapter
|
||||||
|
var home chapter
|
||||||
|
|
||||||
if recursive {
|
if recursive {
|
||||||
chapters := tableOfContent(url, selector, limit, offset, delay, threads, include, images)
|
chapters, home = tableOfContent(url, selector, limit, offset, delay, threads, include)
|
||||||
|
|
||||||
b := New(chapters[0].Name(), chapters[0].Author())
|
|
||||||
for _, c := range chapters {
|
|
||||||
b.AddChapter(c)
|
|
||||||
}
|
|
||||||
|
|
||||||
return b
|
|
||||||
} else {
|
} else {
|
||||||
c := NewChapterFromURL(url, images)
|
chapters = []chapter{NewChapterFromURL(url)}
|
||||||
b := New(c.Name(), c.Author())
|
home = chapters[0]
|
||||||
b.AddChapter(c)
|
|
||||||
return b
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if len(name) == 0 {
|
||||||
|
name = home.Name()
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(author) == 0 {
|
||||||
|
author = home.Author()
|
||||||
|
}
|
||||||
|
|
||||||
|
b := New(name, author)
|
||||||
|
for _, c := range chapters {
|
||||||
|
b.AddChapter(c)
|
||||||
|
}
|
||||||
|
|
||||||
|
return b
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewChapterFromURL(url string, images bool) chapter {
|
func NewChapterFromURL(url string) chapter {
|
||||||
article, err := readability.FromURL(url, 30*time.Second)
|
article, err := readability.FromURL(url, 30*time.Second)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatalf("failed to parse %s, %v\n", url, err)
|
log.Fatalf("failed to parse %s, %v\n", url, err)
|
||||||
@@ -40,31 +49,31 @@ func NewChapterFromURL(url string, images bool) chapter {
|
|||||||
|
|
||||||
content := strings.ReplaceAll(article.Content, "\n", "")
|
content := strings.ReplaceAll(article.Content, "\n", "")
|
||||||
|
|
||||||
if images {
|
// if images {
|
||||||
// parse html content
|
// // parse html content
|
||||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
|
// doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
|
||||||
if err != nil {
|
// if err != nil {
|
||||||
log.Fatal(err)
|
// log.Fatal(err)
|
||||||
}
|
// }
|
||||||
|
|
||||||
// extract images only
|
// // extract images only
|
||||||
content = ""
|
// content = ""
|
||||||
doc.Find("img").Each(func(i int, s *goquery.Selection) {
|
// doc.Find("img").Each(func(i int, s *goquery.Selection) {
|
||||||
newContent, _ := goquery.OuterHtml(s)
|
// newContent, _ := goquery.OuterHtml(s)
|
||||||
content += newContent
|
// content += newContent
|
||||||
})
|
// })
|
||||||
}
|
// }
|
||||||
|
|
||||||
return chapter{article.Title, article.Byline, content}
|
return chapter{article.Title, article.Byline, content}
|
||||||
}
|
}
|
||||||
|
|
||||||
func tableOfContent(url, selector string, limit, offset, delay, threads int, include, images bool) []chapter {
|
func tableOfContent(url, selector string, limit, offset, delay, threads int, include bool) ([]chapter, chapter) {
|
||||||
base, err := urllib.Parse(url)
|
base, err := urllib.Parse(url)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatal(err)
|
log.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
links, err := GetLinks(base, selector, limit, offset, include)
|
links, home, err := GetLinks(base, selector, limit, offset, include)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatal(err)
|
log.Fatal(err)
|
||||||
}
|
}
|
||||||
@@ -82,7 +91,7 @@ func tableOfContent(url, selector string, limit, offset, delay, threads int, inc
|
|||||||
log.Fatal(err)
|
log.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
chapters[index] = NewChapterFromURL(u.String(), images)
|
chapters[index] = NewChapterFromURL(u.String())
|
||||||
progress.Incr(index)
|
progress.Incr(index)
|
||||||
|
|
||||||
// short sleep for last chapter to let the progress bar update
|
// short sleep for last chapter to let the progress bar update
|
||||||
@@ -116,7 +125,7 @@ func tableOfContent(url, selector string, limit, offset, delay, threads int, inc
|
|||||||
log.Fatal(err)
|
log.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
chapters[index] = NewChapterFromURL(u.String(), images)
|
chapters[index] = NewChapterFromURL(u.String())
|
||||||
progress.Incr(index)
|
progress.Incr(index)
|
||||||
|
|
||||||
<-semaphore
|
<-semaphore
|
||||||
@@ -124,7 +133,8 @@ func tableOfContent(url, selector string, limit, offset, delay, threads int, inc
|
|||||||
}
|
}
|
||||||
wg.Wait()
|
wg.Wait()
|
||||||
}
|
}
|
||||||
return chapters
|
|
||||||
|
return chapters, home
|
||||||
}
|
}
|
||||||
|
|
||||||
func GetPath(elm *goquery.Selection) string {
|
func GetPath(elm *goquery.Selection) string {
|
||||||
@@ -144,7 +154,7 @@ func GetPath(elm *goquery.Selection) string {
|
|||||||
return join
|
return join
|
||||||
}
|
}
|
||||||
|
|
||||||
func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool) ([]link, error) {
|
func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool) ([]link, chapter, error) {
|
||||||
selectorSet := true
|
selectorSet := true
|
||||||
if selector == "" {
|
if selector == "" {
|
||||||
selector = "a"
|
selector = "a"
|
||||||
@@ -182,7 +192,7 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool)
|
|||||||
|
|
||||||
links := pathLinks[pathMax]
|
links := pathLinks[pathMax]
|
||||||
if len(links) == 0 {
|
if len(links) == 0 {
|
||||||
return []link{}, fmt.Errorf("no link found for selector: %s", selector)
|
return []link{}, chapter{}, fmt.Errorf("no link found for selector: %s", selector)
|
||||||
}
|
}
|
||||||
|
|
||||||
end := len(links)
|
end := len(links)
|
||||||
@@ -192,11 +202,12 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool)
|
|||||||
|
|
||||||
links = links[offset:end]
|
links = links[offset:end]
|
||||||
|
|
||||||
|
home := NewChapterFromURL(url.String())
|
||||||
|
|
||||||
if include {
|
if include {
|
||||||
c := NewChapterFromURL(url.String(), false)
|
l := NewLink(url.String(), home.Name())
|
||||||
l := NewLink(url.String(), c.Name())
|
|
||||||
links = append([]link{l}, links...)
|
links = append([]link{l}, links...)
|
||||||
}
|
}
|
||||||
|
|
||||||
return links, nil
|
return links, home, nil
|
||||||
}
|
}
|
||||||
|
|||||||
64
cmd/get.go
64
cmd/get.go
@@ -17,7 +17,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var recursive, include, images bool
|
var recursive, include, images bool
|
||||||
var format, output, selector string
|
var format, output, selector, name, author string
|
||||||
var limit, offset, delay, threads int
|
var limit, offset, delay, threads int
|
||||||
|
|
||||||
var getCmd = &cobra.Command{
|
var getCmd = &cobra.Command{
|
||||||
@@ -77,7 +77,7 @@ var getCmd = &cobra.Command{
|
|||||||
},
|
},
|
||||||
Run: func(cmd *cobra.Command, args []string) {
|
Run: func(cmd *cobra.Command, args []string) {
|
||||||
url := args[0]
|
url := args[0]
|
||||||
b := book.NewBookFromURL(url, selector, recursive, include, images, limit, offset, delay, threads)
|
b := book.NewBookFromURL(url, selector, name, author, recursive, include, limit, offset, delay, threads)
|
||||||
|
|
||||||
if len(output) == 0 {
|
if len(output) == 0 {
|
||||||
// set default output
|
// set default output
|
||||||
@@ -136,27 +136,36 @@ var getCmd = &cobra.Command{
|
|||||||
e.SetAuthor(b.Author())
|
e.SetAuthor(b.Author())
|
||||||
|
|
||||||
for _, c := range b.Chapters() {
|
for _, c := range b.Chapters() {
|
||||||
// parse content
|
var content string
|
||||||
|
|
||||||
|
if images == false {
|
||||||
|
content = c.Content()
|
||||||
|
}
|
||||||
|
|
||||||
|
// parse content
|
||||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(c.Content()))
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(c.Content()))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatal(err)
|
log.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
// retrieve images and download it
|
// retrieve images and download it
|
||||||
contentWithLocalImages := c.Content()
|
|
||||||
doc.Find("img").Each(func(i int, s *goquery.Selection) {
|
doc.Find("img").Each(func(i int, s *goquery.Selection) {
|
||||||
src, _ := s.Attr("src")
|
src, _ := s.Attr("src")
|
||||||
imagePath, _ := e.AddImage(src, "")
|
imagePath, _ := e.AddImage(src, "")
|
||||||
|
|
||||||
contentWithLocalImages = strings.ReplaceAll(contentWithLocalImages, src, imagePath)
|
if images {
|
||||||
|
imageTag, _ := goquery.OuterHtml(s)
|
||||||
|
content += imageTag
|
||||||
|
}
|
||||||
|
|
||||||
|
content = strings.ReplaceAll(content, src, imagePath)
|
||||||
})
|
})
|
||||||
|
|
||||||
html := fmt.Sprintf("<h1>%s</h1>%s", c.Name(), contentWithLocalImages)
|
html := fmt.Sprintf("<h1>%s</h1>%s", c.Name(), content)
|
||||||
_, err = e.AddSection(html, c.Name(), "", "")
|
_, err = e.AddSection(html, c.Name(), "", "")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatal(err)
|
log.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
err := e.Write(output)
|
err := e.Write(output)
|
||||||
@@ -171,8 +180,37 @@ var getCmd = &cobra.Command{
|
|||||||
e := epub.NewEpub(b.Name())
|
e := epub.NewEpub(b.Name())
|
||||||
e.SetAuthor(b.Author())
|
e.SetAuthor(b.Author())
|
||||||
|
|
||||||
for _, chapter := range b.Chapters() {
|
for _, c := range b.Chapters() {
|
||||||
e.AddSection(chapter.Content(), chapter.Name(), "", "")
|
var content string
|
||||||
|
|
||||||
|
if images == false {
|
||||||
|
content = c.Content()
|
||||||
|
}
|
||||||
|
|
||||||
|
// parse content
|
||||||
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(c.Content()))
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// retrieve images and download it
|
||||||
|
doc.Find("img").Each(func(i int, s *goquery.Selection) {
|
||||||
|
src, _ := s.Attr("src")
|
||||||
|
imagePath, _ := e.AddImage(src, "")
|
||||||
|
|
||||||
|
if images {
|
||||||
|
imageTag, _ := goquery.OuterHtml(s)
|
||||||
|
content += imageTag
|
||||||
|
}
|
||||||
|
|
||||||
|
content = strings.ReplaceAll(content, src, imagePath)
|
||||||
|
})
|
||||||
|
|
||||||
|
html := fmt.Sprintf("<h1>%s</h1>%s", c.Name(), content)
|
||||||
|
_, err = e.AddSection(html, c.Name(), "", "")
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
outputEPUB := strings.ReplaceAll(output, ".mobi", ".epub")
|
outputEPUB := strings.ReplaceAll(output, ".mobi", ".epub")
|
||||||
@@ -183,16 +221,16 @@ var getCmd = &cobra.Command{
|
|||||||
}
|
}
|
||||||
|
|
||||||
exec.Command("kindlegen", outputEPUB).Run()
|
exec.Command("kindlegen", outputEPUB).Run()
|
||||||
// exec command always return status 1 even if it fails
|
// the exec command always returns status 1, even if it succeeds
|
||||||
// if err != nil {
|
// if err != nil {
|
||||||
// log.Fatal(err)
|
// log.Fatal(err)
|
||||||
// }
|
// }
|
||||||
|
|
||||||
fmt.Printf("Ebook saved to \"%s\"\n", output)
|
fmt.Printf("Ebook saved to \"%s\"\n", output)
|
||||||
|
|
||||||
err2 := os.Remove(outputEPUB)
|
err = os.Remove(outputEPUB)
|
||||||
if err2 != nil {
|
if err != nil {
|
||||||
log.Fatal(err2)
|
log.Fatal(err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ var listCmd = &cobra.Command{
|
|||||||
log.Fatal(err)
|
log.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
links, err := book.GetLinks(base, selector, limit, offset, include)
|
links, _, err := book.GetLinks(base, selector, limit, offset, include)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatal(err)
|
log.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -23,8 +23,10 @@ func Execute() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
|
rootCmd.PersistentFlags().StringVarP(&name, "name", "n", "", "book name (default: page title)")
|
||||||
|
rootCmd.PersistentFlags().StringVarP(&author, "author", "a", "", "book author")
|
||||||
rootCmd.PersistentFlags().StringVarP(&format, "format", "f", "stdout", "file format [stdout, md, epub, mobi]")
|
rootCmd.PersistentFlags().StringVarP(&format, "format", "f", "stdout", "file format [stdout, md, epub, mobi]")
|
||||||
rootCmd.PersistentFlags().StringVarP(&output, "output", "", "", "output file")
|
rootCmd.PersistentFlags().StringVarP(&output, "output", "", "", "file name (default: book name)")
|
||||||
rootCmd.PersistentFlags().StringVarP(&selector, "selector", "s", "", "table of content CSS selector, in resursive mode")
|
rootCmd.PersistentFlags().StringVarP(&selector, "selector", "s", "", "table of content CSS selector, in resursive mode")
|
||||||
rootCmd.PersistentFlags().BoolVarP(&recursive, "recursive", "r", false, "create one chapter per natigation item")
|
rootCmd.PersistentFlags().BoolVarP(&recursive, "recursive", "r", false, "create one chapter per natigation item")
|
||||||
rootCmd.PersistentFlags().BoolVarP(&include, "include", "i", false, "include URL as first chapter, in resursive mode")
|
rootCmd.PersistentFlags().BoolVarP(&include, "include", "i", false, "include URL as first chapter, in resursive mode")
|
||||||
|
|||||||
@@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{
|
|||||||
Use: "version",
|
Use: "version",
|
||||||
Short: "Print the version number of papeer",
|
Short: "Print the version number of papeer",
|
||||||
Run: func(cmd *cobra.Command, args []string) {
|
Run: func(cmd *cobra.Command, args []string) {
|
||||||
fmt.Println("papeer v0.3.0")
|
fmt.Println("papeer v0.3.1")
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user