Mirror of https://github.com/NohamR/papeer.git (synced 2026-05-25 20:00:47 +00:00)
Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| | 32168718c9 | | |
| | 403fdcc0f0 | | |
| | 1b2be1c390 | | |
| | 4521497d12 | | |
| | 2cbcf17cc2 | | |
2
.github/workflows/release.yml
vendored
2
.github/workflows/release.yml
vendored
@@ -15,7 +15,7 @@ jobs:
|
|||||||
goos: windows
|
goos: windows
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v3
|
||||||
- uses: wangyoucao577/go-release-action@v1.30
|
- uses: wangyoucao577/go-release-action@v1.40
|
||||||
with:
|
with:
|
||||||
github_token: ${{ secrets.GITHUB_TOKEN }}
|
github_token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
goos: ${{ matrix.goos }}
|
goos: ${{ matrix.goos }}
|
||||||
|
|||||||
@@ -40,7 +40,9 @@ Download [latest release](https://github.com/lapwat/papeer/releases/latest) for
|
|||||||
|
|
||||||
## MOBI support
|
## MOBI support
|
||||||
|
|
||||||
Install kindlegen to convert websites, Linux only.
|
> Kindle e-readers now support EPUB format
|
||||||
|
|
||||||
|
Install kindlegen to export websites to Kindle compatible ebooks, Linux only.
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
TMPDIR=$(mktemp -d -t papeer-XXXXX)
|
TMPDIR=$(mktemp -d -t papeer-XXXXX)
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package book
|
package book
|
||||||
|
|
||||||
type chapter struct {
|
type chapter struct {
|
||||||
|
url string
|
||||||
body string
|
body string
|
||||||
name string
|
name string
|
||||||
author string
|
author string
|
||||||
@@ -10,11 +11,11 @@ type chapter struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func NewEmptyChapter() chapter {
|
func NewEmptyChapter() chapter {
|
||||||
return chapter{"", "", "", "", []chapter{}, NewScrapeConfigNoInclude()}
|
return chapter{"", "", "", "", "", []chapter{}, NewScrapeConfigNoInclude()}
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewChapter(body, name, author, content string, subChapters []chapter, config *ScrapeConfig) chapter {
|
func (c chapter) URL() string {
|
||||||
return chapter{body, name, author, content, subChapters, config}
|
return c.url
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c chapter) Body() string {
|
func (c chapter) Body() string {
|
||||||
|
|||||||
@@ -30,6 +30,11 @@ func ToMarkdownString(c chapter) string {
|
|||||||
markdown += fmt.Sprintf("%s\n", c.Name())
|
markdown += fmt.Sprintf("%s\n", c.Name())
|
||||||
markdown += fmt.Sprintf("%s\n\n", strings.Repeat("=", len(c.Name())))
|
markdown += fmt.Sprintf("%s\n\n", strings.Repeat("=", len(c.Name())))
|
||||||
|
|
||||||
|
// url
|
||||||
|
if c.config.PrintURL {
|
||||||
|
markdown += fmt.Sprintf("_%s_\n\n", c.URL())
|
||||||
|
}
|
||||||
|
|
||||||
// convert content to markdown
|
// convert content to markdown
|
||||||
content, err := md.NewConverter("", true, nil).ConvertString(c.Content())
|
content, err := md.NewConverter("", true, nil).ConvertString(c.Content())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -72,7 +77,15 @@ func ToHtmlString(c chapter) string {
|
|||||||
|
|
||||||
// chapter content
|
// chapter content
|
||||||
if c.config.Include {
|
if c.config.Include {
|
||||||
html += fmt.Sprintf("<h1>%s</h1>", c.Name())
|
// title
|
||||||
|
html += fmt.Sprintf("<h1>%s</h1>\n", c.Name())
|
||||||
|
|
||||||
|
// url
|
||||||
|
if c.config.PrintURL {
|
||||||
|
html += fmt.Sprintf("<p><i>%s</i></p>\n", c.URL())
|
||||||
|
}
|
||||||
|
|
||||||
|
// content
|
||||||
html += c.Content()
|
html += c.Content()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -114,19 +127,6 @@ func ToEpub(c chapter, filename string) string {
|
|||||||
e := epub.NewEpub(c.Name())
|
e := epub.NewEpub(c.Name())
|
||||||
e.SetAuthor(c.Author())
|
e.SetAuthor(c.Author())
|
||||||
|
|
||||||
AppendToEpub(e, c)
|
|
||||||
|
|
||||||
err := e.Write(filename)
|
|
||||||
if err != nil {
|
|
||||||
log.Fatal(err)
|
|
||||||
}
|
|
||||||
|
|
||||||
return filename
|
|
||||||
}
|
|
||||||
|
|
||||||
func AppendToEpub(e *epub.Epub, c chapter) {
|
|
||||||
content := ""
|
|
||||||
|
|
||||||
// append table of content
|
// append table of content
|
||||||
if len(c.SubChapters()) > 1 {
|
if len(c.SubChapters()) > 1 {
|
||||||
html := "<h1>Table of Contents</h1>"
|
html := "<h1>Table of Contents</h1>"
|
||||||
@@ -143,6 +143,19 @@ func AppendToEpub(e *epub.Epub, c chapter) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
AppendToEpub(e, c)
|
||||||
|
|
||||||
|
err := e.Write(filename)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return filename
|
||||||
|
}
|
||||||
|
|
||||||
|
func AppendToEpub(e *epub.Epub, c chapter) {
|
||||||
|
content := ""
|
||||||
|
|
||||||
// chapter content
|
// chapter content
|
||||||
if c.config.Include {
|
if c.config.Include {
|
||||||
|
|
||||||
@@ -164,17 +177,24 @@ func AppendToEpub(e *epub.Epub, c chapter) {
|
|||||||
|
|
||||||
if c.config.ImagesOnly {
|
if c.config.ImagesOnly {
|
||||||
imageTag, _ := goquery.OuterHtml(s)
|
imageTag, _ := goquery.OuterHtml(s)
|
||||||
content += strings.Replace(imageTag, src, imagePath, 1)
|
content += strings.ReplaceAll(imageTag, src, imagePath)
|
||||||
} else {
|
} else {
|
||||||
content = strings.Replace(content, src, imagePath, 1)
|
content = strings.ReplaceAll(content, src, imagePath)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
html := ""
|
html := ""
|
||||||
// add title only if ImagesOnly = false
|
// add title only if ImagesOnly = false
|
||||||
if c.config.ImagesOnly == false {
|
if c.config.ImagesOnly == false {
|
||||||
html += fmt.Sprintf("<h1>%s</h1>", c.Name())
|
html += fmt.Sprintf("<h1>%s</h1>\n", c.Name())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// url
|
||||||
|
if c.config.PrintURL {
|
||||||
|
html += fmt.Sprintf("<p><i>%s</i></p>\n", c.URL())
|
||||||
|
}
|
||||||
|
|
||||||
|
// content
|
||||||
html += content
|
html += content
|
||||||
|
|
||||||
// write to epub file
|
// write to epub file
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
11
book/link.go
11
book/link.go
@@ -1,10 +1,13 @@
|
|||||||
package book
|
package book
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
type link struct {
|
type link struct {
|
||||||
Href string `json:"url"`
|
Href string `json:"url"`
|
||||||
Text string `json:"name"`
|
Text string `json:"name"`
|
||||||
|
Date *time.Time `json:"date"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewLink(href, text string) link {
|
func NewLink(href, text string, date *time.Time) link {
|
||||||
return link{href, text}
|
return link{href, text, date}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -30,14 +30,19 @@ type ScrapeConfig struct {
|
|||||||
Include bool
|
Include bool
|
||||||
ImagesOnly bool
|
ImagesOnly bool
|
||||||
UseLinkName bool
|
UseLinkName bool
|
||||||
|
PrintURL bool
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewScrapeConfig() *ScrapeConfig {
|
func NewScrapeConfig() *ScrapeConfig {
|
||||||
return &ScrapeConfig{0, "", false, -1, 0, false, -1, -1, true, false, false}
|
return &ScrapeConfig{0, "", false, -1, 0, false, -1, -1, true, false, false, false}
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewScrapeConfigQuiet() *ScrapeConfig {
|
||||||
|
return &ScrapeConfig{0, "", true, -1, 0, false, -1, -1, true, false, false, false}
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewScrapeConfigNoInclude() *ScrapeConfig {
|
func NewScrapeConfigNoInclude() *ScrapeConfig {
|
||||||
return &ScrapeConfig{0, "", false, -1, 0, false, -1, -1, false, false, false}
|
return &ScrapeConfig{0, "", false, -1, 0, false, -1, -1, false, false, false, false}
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewScrapeConfigs(selectors []string) []*ScrapeConfig {
|
func NewScrapeConfigs(selectors []string) []*ScrapeConfig {
|
||||||
@@ -237,7 +242,6 @@ func NewChapterFromURL(url, linkName string, configs []*ScrapeConfig, index int,
|
|||||||
content = ""
|
content = ""
|
||||||
doc.Find("img").Each(func(i int, s *goquery.Selection) {
|
doc.Find("img").Each(func(i int, s *goquery.Selection) {
|
||||||
imageTag, _ := goquery.OuterHtml(s)
|
imageTag, _ := goquery.OuterHtml(s)
|
||||||
// imageTag = strings.ReplaceAll(imageTag, "\n", "")
|
|
||||||
content += imageTag
|
content += imageTag
|
||||||
})
|
})
|
||||||
|
|
||||||
@@ -252,7 +256,7 @@ func NewChapterFromURL(url, linkName string, configs []*ScrapeConfig, index int,
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return chapter{string(body), name, article.Byline, content, subchapters, config}
|
return chapter{url, string(body), name, article.Byline, content, subchapters, config}
|
||||||
}
|
}
|
||||||
|
|
||||||
func tableOfContent(url string, config *ScrapeConfig, subConfig *ScrapeConfig, quiet bool) ([]chapter, chapter) {
|
func tableOfContent(url string, config *ScrapeConfig, subConfig *ScrapeConfig, quiet bool) ([]chapter, chapter) {
|
||||||
@@ -370,7 +374,7 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, reverse, incl
|
|||||||
log.Fatal(err)
|
log.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
links = append(links, NewLink(u.String(), item.Title))
|
links = append(links, NewLink(u.String(), item.Title, item.PublishedParsed))
|
||||||
}
|
}
|
||||||
|
|
||||||
pathMax = "RSS"
|
pathMax = "RSS"
|
||||||
@@ -405,7 +409,7 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, reverse, incl
|
|||||||
// if selector is set, we use the selector specified by the user
|
// if selector is set, we use the selector specified by the user
|
||||||
|
|
||||||
key = selector
|
key = selector
|
||||||
pathLinks[key] = append(pathLinks[key], NewLink(href, text))
|
pathLinks[key] = append(pathLinks[key], NewLink(href, text, &time.Time{}))
|
||||||
pathCount[key] += 1
|
pathCount[key] += 1
|
||||||
pathMax = key
|
pathMax = key
|
||||||
|
|
||||||
@@ -419,7 +423,7 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, reverse, incl
|
|||||||
|
|
||||||
// we count this key if the link text is not empty
|
// we count this key if the link text is not empty
|
||||||
if text != "" {
|
if text != "" {
|
||||||
pathLinks[key] = append(pathLinks[key], NewLink(href, text))
|
pathLinks[key] = append(pathLinks[key], NewLink(href, text, &time.Time{}))
|
||||||
pathCount[key] += len(text)
|
pathCount[key] += len(text)
|
||||||
|
|
||||||
if pathCount[key] > pathCount[pathMax] {
|
if pathCount[key] > pathCount[pathMax] {
|
||||||
@@ -449,7 +453,7 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, reverse, incl
|
|||||||
|
|
||||||
// include home page
|
// include home page
|
||||||
if include {
|
if include {
|
||||||
l := NewLink(url.String(), home.Name())
|
l := NewLink(url.String(), home.Name(), &time.Time{})
|
||||||
links = append([]link{l}, links...)
|
links = append([]link{l}, links...)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
27
cmd/get.go
27
cmd/get.go
@@ -1,6 +1,7 @@
|
|||||||
package cmd
|
package cmd
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/json"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
@@ -32,6 +33,7 @@ type GetOptions struct {
|
|||||||
threads int
|
threads int
|
||||||
include bool
|
include bool
|
||||||
useLinkName bool
|
useLinkName bool
|
||||||
|
printURL bool
|
||||||
}
|
}
|
||||||
|
|
||||||
var getOpts *GetOptions
|
var getOpts *GetOptions
|
||||||
@@ -41,10 +43,11 @@ func init() {
|
|||||||
|
|
||||||
getCmd.Flags().StringVarP(&getOpts.name, "name", "n", "", "book name (default: page title)")
|
getCmd.Flags().StringVarP(&getOpts.name, "name", "n", "", "book name (default: page title)")
|
||||||
getCmd.Flags().StringVarP(&getOpts.author, "author", "a", "", "book author")
|
getCmd.Flags().StringVarP(&getOpts.author, "author", "a", "", "book author")
|
||||||
getCmd.Flags().StringVarP(&getOpts.Format, "format", "f", "md", "file format [md, html, epub, mobi]")
|
getCmd.Flags().StringVarP(&getOpts.Format, "format", "f", "md", "file format [md, json, html, epub, mobi]")
|
||||||
getCmd.Flags().StringVarP(&getOpts.output, "output", "", "", "file name (default: book name)")
|
getCmd.Flags().StringVarP(&getOpts.output, "output", "", "", "file name (default: book name)")
|
||||||
getCmd.Flags().BoolVarP(&getOpts.stdout, "stdout", "", false, "print to standard output")
|
getCmd.Flags().BoolVarP(&getOpts.stdout, "stdout", "", false, "print to standard output")
|
||||||
getCmd.Flags().BoolVarP(&getOpts.images, "images", "", false, "retrieve images only")
|
getCmd.Flags().BoolVarP(&getOpts.images, "images", "", false, "retrieve images only")
|
||||||
|
getCmd.Flags().BoolVarP(&getOpts.printURL, "print-url", "", false, "print url after chapter title")
|
||||||
getCmd.Flags().BoolVarP(&getOpts.quiet, "quiet", "q", false, "hide progress bar")
|
getCmd.Flags().BoolVarP(&getOpts.quiet, "quiet", "q", false, "hide progress bar")
|
||||||
|
|
||||||
// common with list command
|
// common with list command
|
||||||
@@ -73,6 +76,7 @@ var getCmd = &cobra.Command{
|
|||||||
// check provided format is in list
|
// check provided format is in list
|
||||||
formatEnum := map[string]bool{
|
formatEnum := map[string]bool{
|
||||||
"md": true,
|
"md": true,
|
||||||
|
"json": true,
|
||||||
"html": true,
|
"html": true,
|
||||||
"epub": true,
|
"epub": true,
|
||||||
"mobi": true,
|
"mobi": true,
|
||||||
@@ -145,6 +149,7 @@ var getCmd = &cobra.Command{
|
|||||||
config.ImagesOnly = getOpts.images
|
config.ImagesOnly = getOpts.images
|
||||||
config.Include = getOpts.include
|
config.Include = getOpts.include
|
||||||
config.UseLinkName = getOpts.useLinkName
|
config.UseLinkName = getOpts.useLinkName
|
||||||
|
config.PrintURL = getOpts.printURL
|
||||||
|
|
||||||
// do not use link name for root level as there is not parent link
|
// do not use link name for root level as there is not parent link
|
||||||
if index == 0 {
|
if index == 0 {
|
||||||
@@ -182,6 +187,26 @@ var getCmd = &cobra.Command{
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if getOpts.Format == "json" {
|
||||||
|
filename := book.ToMarkdown(c, getOpts.output)
|
||||||
|
|
||||||
|
bytesRead, err := ioutil.ReadFile(filename)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
book := make(map[string]interface{})
|
||||||
|
book["name"] = c.Name()
|
||||||
|
book["content"] = string(bytesRead)
|
||||||
|
|
||||||
|
bookJson, err := json.Marshal(book)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Println(string(bookJson))
|
||||||
|
}
|
||||||
|
|
||||||
if getOpts.Format == "html" {
|
if getOpts.Format == "html" {
|
||||||
filename := book.ToHtml(c, getOpts.output)
|
filename := book.ToHtml(c, getOpts.output)
|
||||||
|
|
||||||
|
|||||||
@@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{
|
|||||||
Use: "version",
|
Use: "version",
|
||||||
Short: "Print the version number of papeer",
|
Short: "Print the version number of papeer",
|
||||||
Run: func(cmd *cobra.Command, args []string) {
|
Run: func(cmd *cobra.Command, args []string) {
|
||||||
fmt.Println("papeer v0.7.1")
|
fmt.Println("papeer v0.8.3")
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user