3 Commits

Author SHA1 Message Date
lapwat
403fdcc0f0 [get] print url option 2024-08-14 23:32:33 +02:00
lapwat
1b2be1c390 update tests 2024-08-14 14:32:00 +02:00
lapwat
4521497d12 add pub date to chapter skeleton 2023-10-01 22:07:38 +02:00
8 changed files with 154 additions and 93 deletions

View File

@@ -1,6 +1,7 @@
package book package book
type chapter struct { type chapter struct {
url string
body string body string
name string name string
author string author string
@@ -10,11 +11,11 @@ type chapter struct {
} }
func NewEmptyChapter() chapter { func NewEmptyChapter() chapter {
return chapter{"", "", "", "", []chapter{}, NewScrapeConfigNoInclude()} return chapter{"", "", "", "", "", []chapter{}, NewScrapeConfigNoInclude()}
} }
func NewChapter(body, name, author, content string, subChapters []chapter, config *ScrapeConfig) chapter { func (c chapter) URL() string {
return chapter{body, name, author, content, subChapters, config} return c.url
} }
func (c chapter) Body() string { func (c chapter) Body() string {

View File

@@ -30,6 +30,11 @@ func ToMarkdownString(c chapter) string {
markdown += fmt.Sprintf("%s\n", c.Name()) markdown += fmt.Sprintf("%s\n", c.Name())
markdown += fmt.Sprintf("%s\n\n", strings.Repeat("=", len(c.Name()))) markdown += fmt.Sprintf("%s\n\n", strings.Repeat("=", len(c.Name())))
// url
if c.config.PrintURL {
markdown += fmt.Sprintf("_Source: %s_\n\n", c.URL())
}
// convert content to markdown // convert content to markdown
content, err := md.NewConverter("", true, nil).ConvertString(c.Content()) content, err := md.NewConverter("", true, nil).ConvertString(c.Content())
if err != nil { if err != nil {
@@ -72,7 +77,15 @@ func ToHtmlString(c chapter) string {
// chapter content // chapter content
if c.config.Include { if c.config.Include {
html += fmt.Sprintf("<h1>%s</h1>", c.Name()) // title
html += fmt.Sprintf("<h1>%s</h1>\n", c.Name())
// url
if c.config.PrintURL {
html += fmt.Sprintf("<p><i>Source: %s</i></p>\n", c.URL())
}
// content
html += c.Content() html += c.Content()
} }
@@ -114,19 +127,6 @@ func ToEpub(c chapter, filename string) string {
e := epub.NewEpub(c.Name()) e := epub.NewEpub(c.Name())
e.SetAuthor(c.Author()) e.SetAuthor(c.Author())
AppendToEpub(e, c)
err := e.Write(filename)
if err != nil {
log.Fatal(err)
}
return filename
}
func AppendToEpub(e *epub.Epub, c chapter) {
content := ""
// append table of content // append table of content
if len(c.SubChapters()) > 1 { if len(c.SubChapters()) > 1 {
html := "<h1>Table of Contents</h1>" html := "<h1>Table of Contents</h1>"
@@ -143,6 +143,19 @@ func AppendToEpub(e *epub.Epub, c chapter) {
} }
} }
AppendToEpub(e, c)
err := e.Write(filename)
if err != nil {
log.Fatal(err)
}
return filename
}
func AppendToEpub(e *epub.Epub, c chapter) {
content := ""
// chapter content // chapter content
if c.config.Include { if c.config.Include {
@@ -173,8 +186,15 @@ func AppendToEpub(e *epub.Epub, c chapter) {
html := "" html := ""
// add title only if ImagesOnly = false // add title only if ImagesOnly = false
if c.config.ImagesOnly == false { if c.config.ImagesOnly == false {
html += fmt.Sprintf("<h1>%s</h1>", c.Name()) html += fmt.Sprintf("<h1>%s</h1>\n", c.Name())
} }
// url
if c.config.PrintURL {
html += fmt.Sprintf("<p><i>Source: %s</i></p>\n", c.URL())
}
// content
html += content html += content
// write to epub file // write to epub file

File diff suppressed because one or more lines are too long

View File

@@ -1,10 +1,13 @@
package book package book
import "time"
type link struct { type link struct {
Href string `json:"url"` Href string `json:"url"`
Text string `json:"name"` Text string `json:"name"`
Date *time.Time `json:"date"`
} }
func NewLink(href, text string) link { func NewLink(href, text string, date *time.Time) link {
return link{href, text} return link{href, text, date}
} }

View File

@@ -30,14 +30,19 @@ type ScrapeConfig struct {
Include bool Include bool
ImagesOnly bool ImagesOnly bool
UseLinkName bool UseLinkName bool
PrintURL bool
} }
func NewScrapeConfig() *ScrapeConfig { func NewScrapeConfig() *ScrapeConfig {
return &ScrapeConfig{0, "", false, -1, 0, false, -1, -1, true, false, false} return &ScrapeConfig{0, "", false, -1, 0, false, -1, -1, true, false, false, false}
}
func NewScrapeConfigQuiet() *ScrapeConfig {
return &ScrapeConfig{0, "", true, -1, 0, false, -1, -1, true, false, false, false}
} }
func NewScrapeConfigNoInclude() *ScrapeConfig { func NewScrapeConfigNoInclude() *ScrapeConfig {
return &ScrapeConfig{0, "", false, -1, 0, false, -1, -1, false, false, false} return &ScrapeConfig{0, "", false, -1, 0, false, -1, -1, false, false, false, false}
} }
func NewScrapeConfigs(selectors []string) []*ScrapeConfig { func NewScrapeConfigs(selectors []string) []*ScrapeConfig {
@@ -252,7 +257,7 @@ func NewChapterFromURL(url, linkName string, configs []*ScrapeConfig, index int,
} }
return chapter{string(body), name, article.Byline, content, subchapters, config} return chapter{url, string(body), name, article.Byline, content, subchapters, config}
} }
func tableOfContent(url string, config *ScrapeConfig, subConfig *ScrapeConfig, quiet bool) ([]chapter, chapter) { func tableOfContent(url string, config *ScrapeConfig, subConfig *ScrapeConfig, quiet bool) ([]chapter, chapter) {
@@ -370,7 +375,7 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, reverse, incl
log.Fatal(err) log.Fatal(err)
} }
links = append(links, NewLink(u.String(), item.Title)) links = append(links, NewLink(u.String(), item.Title, item.PublishedParsed))
} }
pathMax = "RSS" pathMax = "RSS"
@@ -405,7 +410,7 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, reverse, incl
// if selector is set, we use the selector specified by the user // if selector is set, we use the selector specified by the user
key = selector key = selector
pathLinks[key] = append(pathLinks[key], NewLink(href, text)) pathLinks[key] = append(pathLinks[key], NewLink(href, text, &time.Time{}))
pathCount[key] += 1 pathCount[key] += 1
pathMax = key pathMax = key
@@ -419,7 +424,7 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, reverse, incl
// we count this key if the link text is not empty // we count this key if the link text is not empty
if text != "" { if text != "" {
pathLinks[key] = append(pathLinks[key], NewLink(href, text)) pathLinks[key] = append(pathLinks[key], NewLink(href, text, &time.Time{}))
pathCount[key] += len(text) pathCount[key] += len(text)
if pathCount[key] > pathCount[pathMax] { if pathCount[key] > pathCount[pathMax] {
@@ -449,7 +454,7 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, reverse, incl
// include home page // include home page
if include { if include {
l := NewLink(url.String(), home.Name()) l := NewLink(url.String(), home.Name(), &time.Time{})
links = append([]link{l}, links...) links = append([]link{l}, links...)
} }

File diff suppressed because one or more lines are too long

View File

@@ -33,6 +33,7 @@ type GetOptions struct {
threads int threads int
include bool include bool
useLinkName bool useLinkName bool
printURL bool
} }
var getOpts *GetOptions var getOpts *GetOptions
@@ -46,6 +47,7 @@ func init() {
getCmd.Flags().StringVarP(&getOpts.output, "output", "", "", "file name (default: book name)") getCmd.Flags().StringVarP(&getOpts.output, "output", "", "", "file name (default: book name)")
getCmd.Flags().BoolVarP(&getOpts.stdout, "stdout", "", false, "print to standard output") getCmd.Flags().BoolVarP(&getOpts.stdout, "stdout", "", false, "print to standard output")
getCmd.Flags().BoolVarP(&getOpts.images, "images", "", false, "retrieve images only") getCmd.Flags().BoolVarP(&getOpts.images, "images", "", false, "retrieve images only")
getCmd.Flags().BoolVarP(&getOpts.printURL, "print-url", "", false, "print url after chapter title")
getCmd.Flags().BoolVarP(&getOpts.quiet, "quiet", "q", false, "hide progress bar") getCmd.Flags().BoolVarP(&getOpts.quiet, "quiet", "q", false, "hide progress bar")
// common with list command // common with list command
@@ -147,6 +149,7 @@ var getCmd = &cobra.Command{
config.ImagesOnly = getOpts.images config.ImagesOnly = getOpts.images
config.Include = getOpts.include config.Include = getOpts.include
config.UseLinkName = getOpts.useLinkName config.UseLinkName = getOpts.useLinkName
config.PrintURL = getOpts.printURL
// do not use link name for root level as there is no parent link // do not use link name for root level as there is no parent link
if index == 0 { if index == 0 {

View File

@@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{
Use: "version", Use: "version",
Short: "Print the version number of papeer", Short: "Print the version number of papeer",
Run: func(cmd *cobra.Command, args []string) { Run: func(cmd *cobra.Command, args []string) {
fmt.Println("papeer v0.8.0") fmt.Println("papeer v0.8.2")
}, },
} }