mirror of
https://github.com/NohamR/papeer.git
synced 2026-05-24 20:00:45 +00:00
chain selectors, depth & quiet options, split main commands
This commit is contained in:
168
cmd/get.go
168
cmd/get.go
@@ -3,8 +3,6 @@ package cmd
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/spf13/cobra"
|
||||
@@ -12,13 +10,57 @@ import (
|
||||
"github.com/lapwat/papeer/book"
|
||||
)
|
||||
|
||||
var recursive, include, images, quiet bool
|
||||
var format, output, selector, name, author string
|
||||
var limit, offset, delay, threads int
|
||||
type GetOptions struct {
|
||||
// url string
|
||||
|
||||
name string
|
||||
author string
|
||||
Format string
|
||||
output string
|
||||
images bool
|
||||
// ImagesOnly bool
|
||||
quiet bool
|
||||
|
||||
Selector []string
|
||||
depth int
|
||||
limit int
|
||||
offset int
|
||||
delay int
|
||||
threads int
|
||||
// includeUrl bool
|
||||
include bool
|
||||
useLinkName bool
|
||||
}
|
||||
|
||||
var getOpts *GetOptions
|
||||
|
||||
func init() {
|
||||
getOpts = &GetOptions{}
|
||||
|
||||
getCmd.PersistentFlags().StringVarP(&getOpts.name, "name", "n", "", "book name (default: page title)")
|
||||
getCmd.PersistentFlags().StringVarP(&getOpts.author, "author", "a", "", "book author")
|
||||
getCmd.PersistentFlags().StringVarP(&getOpts.Format, "format", "f", "md", "file format [stdout, md, epub, mobi]")
|
||||
getCmd.PersistentFlags().StringVarP(&getOpts.output, "output", "", "", "file name (default: book name)")
|
||||
getCmd.PersistentFlags().BoolVarP(&getOpts.images, "images", "", false, "retrieve images only")
|
||||
getCmd.PersistentFlags().BoolVarP(&getOpts.quiet, "quiet", "q", false, "hide progress bar")
|
||||
|
||||
// common with list command
|
||||
getCmd.Flags().StringSliceVarP(&getOpts.Selector, "selector", "s", []string{}, "table of contents CSS selector")
|
||||
getCmd.Flags().IntVarP(&getOpts.depth, "depth", "d", 0, "scraping depth")
|
||||
getCmd.Flags().IntVarP(&getOpts.limit, "limit", "l", -1, "limit number of chapters, use with depth/selector")
|
||||
getCmd.Flags().IntVarP(&getOpts.offset, "offset", "o", 0, "skip first chapters, use with depth/selector")
|
||||
getCmd.Flags().IntVarP(&getOpts.delay, "delay", "", -1, "time in milliseconds to wait before downloading next chapter, use with depth/selector")
|
||||
getCmd.Flags().IntVarP(&getOpts.threads, "threads", "t", -1, "download concurrency, use with depth/selector")
|
||||
getCmd.Flags().BoolVarP(&getOpts.include, "include", "i", false, "include URL as first chapter, use with depth/selector")
|
||||
getCmd.Flags().BoolVarP(&getOpts.useLinkName, "use-link-name", "", false, "use link name for chapter title")
|
||||
|
||||
rootCmd.AddCommand(getCmd)
|
||||
}
|
||||
|
||||
var getCmd = &cobra.Command{
|
||||
Use: "get",
|
||||
Short: "Scrape URL content",
|
||||
Use: "get URL",
|
||||
Short: "Scrape URL content",
|
||||
Example: "papeer get https://www.eff.org/cyberspace-independence",
|
||||
Args: func(cmd *cobra.Command, args []string) error {
|
||||
if len(args) < 1 {
|
||||
return errors.New("requires an URL argument")
|
||||
@@ -30,39 +72,36 @@ var getCmd = &cobra.Command{
|
||||
"epub": true,
|
||||
"mobi": true,
|
||||
}
|
||||
if formatEnum[format] != true {
|
||||
return fmt.Errorf("invalid format specified: %s", format)
|
||||
|
||||
if formatEnum[getOpts.Format] != true {
|
||||
return fmt.Errorf("invalid format specified: %s", getOpts.Format)
|
||||
}
|
||||
|
||||
// add .mobi to filename if not specified
|
||||
if format == "mobi" {
|
||||
if len(output) > 0 && strings.HasSuffix(output, ".mobi") == false {
|
||||
output = fmt.Sprintf("%s.mobi", output)
|
||||
if getOpts.Format == "mobi" {
|
||||
if len(getOpts.output) > 0 && strings.HasSuffix(getOpts.output, ".mobi") == false {
|
||||
getOpts.output = fmt.Sprintf("%s.mobi", getOpts.output)
|
||||
}
|
||||
}
|
||||
|
||||
if cmd.Flags().Changed("selector") && recursive == false {
|
||||
return errors.New("cannot use selector option if not in recursive mode")
|
||||
if cmd.Flags().Changed("include") && getOpts.depth == 0 && len(getOpts.Selector) == 0 {
|
||||
return errors.New("cannot use include option if depth/selector is not specified")
|
||||
}
|
||||
|
||||
if cmd.Flags().Changed("include") && recursive == false {
|
||||
return errors.New("cannot use include option if not in recursive mode")
|
||||
if cmd.Flags().Changed("limit") && getOpts.depth == 0 && len(getOpts.Selector) == 0 {
|
||||
return errors.New("cannot use limit option if depth/selector is not specified")
|
||||
}
|
||||
|
||||
if cmd.Flags().Changed("limit") && recursive == false {
|
||||
return errors.New("cannot use limit option if not in recursive mode")
|
||||
if cmd.Flags().Changed("offset") && getOpts.depth == 0 && len(getOpts.Selector) == 0 {
|
||||
return errors.New("cannot use offset option if depth/selector is not specified")
|
||||
}
|
||||
|
||||
if cmd.Flags().Changed("offset") && recursive == false {
|
||||
return errors.New("cannot use offset option if not in recursive mode")
|
||||
if cmd.Flags().Changed("delay") && getOpts.depth == 0 && len(getOpts.Selector) == 0 {
|
||||
return errors.New("cannot use delay option if depth/selector is not specified")
|
||||
}
|
||||
|
||||
if cmd.Flags().Changed("delay") && recursive == false {
|
||||
return errors.New("cannot use delay option if not in recursive mode")
|
||||
}
|
||||
|
||||
if cmd.Flags().Changed("threads") && recursive == false {
|
||||
return errors.New("cannot use threads option if not in recursive mode")
|
||||
if cmd.Flags().Changed("threads") && getOpts.depth == 0 && len(getOpts.Selector) == 0 {
|
||||
return errors.New("cannot use threads option if depth/selector is not specified")
|
||||
}
|
||||
|
||||
if cmd.Flags().Changed("delay") && cmd.Flags().Changed("threads") {
|
||||
@@ -73,48 +112,59 @@ var getCmd = &cobra.Command{
|
||||
},
|
||||
Run: func(cmd *cobra.Command, args []string) {
|
||||
url := args[0]
|
||||
b := book.NewBookFromURL(url, selector, name, author, recursive, include, images, quiet, limit, offset, delay, threads)
|
||||
|
||||
fakeConfig := book.NewScrapeConfigFake()
|
||||
fakeChapter := book.NewChapter("", b.Name(), b.Author(), "", b.Chapters(), fakeConfig)
|
||||
// fill selector array with empty selectors to match depth
|
||||
for len(getOpts.Selector) < getOpts.depth+2 {
|
||||
getOpts.Selector = append(getOpts.Selector, "")
|
||||
}
|
||||
|
||||
if format == "stdout" {
|
||||
// TODO: ToMarkdownString
|
||||
markdown := book.ToMarkdown(fakeChapter)
|
||||
// generate config for each level
|
||||
configs := make([]*book.ScrapeConfig, len(getOpts.Selector))
|
||||
for index, s := range getOpts.Selector {
|
||||
config := book.NewScrapeConfig()
|
||||
config.Selector = s
|
||||
config.Quiet = getOpts.quiet
|
||||
config.Limit = getOpts.limit
|
||||
config.Offset = getOpts.offset
|
||||
config.Delay = getOpts.delay
|
||||
config.Threads = getOpts.threads
|
||||
config.ImagesOnly = getOpts.images
|
||||
config.Include = getOpts.include
|
||||
config.UseLinkName = getOpts.useLinkName
|
||||
|
||||
// do not use link name for root level as there is no parent link
|
||||
if index == 0 {
|
||||
config.UseLinkName = false
|
||||
}
|
||||
|
||||
// always include last level by default
|
||||
if index == len(getOpts.Selector)-1 {
|
||||
config.Include = true
|
||||
}
|
||||
|
||||
configs[index] = config
|
||||
}
|
||||
|
||||
c := book.NewChapterFromURL(url, "", configs, 0, func(index int, name string) {})
|
||||
|
||||
if getOpts.Format == "stdout" {
|
||||
markdown := book.ToMarkdownString(c)
|
||||
fmt.Println(markdown)
|
||||
}
|
||||
|
||||
if format == "md" {
|
||||
// TODO: ToMarkdownFile
|
||||
markdown := book.ToMarkdown(fakeChapter)
|
||||
|
||||
if len(output) == 0 {
|
||||
filename := book.Filename(fakeChapter.Name())
|
||||
output = fmt.Sprintf("%s.md", filename)
|
||||
}
|
||||
|
||||
// write to file
|
||||
f, err := os.Create(output)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
_, err2 := f.WriteString(markdown)
|
||||
if err2 != nil {
|
||||
log.Fatal(err2)
|
||||
}
|
||||
f.Close()
|
||||
|
||||
fmt.Printf("Markdown saved to \"%s\"\n", output)
|
||||
if getOpts.Format == "md" {
|
||||
filename := book.ToMarkdown(c, getOpts.output)
|
||||
fmt.Printf("Markdown saved to \"%s\"\n", filename)
|
||||
}
|
||||
|
||||
if format == "epub" {
|
||||
output = book.ToEpub(fakeChapter, output)
|
||||
fmt.Printf("Ebook saved to \"%s\"\n", output)
|
||||
if getOpts.Format == "epub" {
|
||||
filename := book.ToEpub(c, getOpts.output)
|
||||
fmt.Printf("Ebook saved to \"%s\"\n", filename)
|
||||
}
|
||||
|
||||
if format == "mobi" {
|
||||
output = book.ToMobi(fakeChapter, output)
|
||||
fmt.Printf("Ebook saved to \"%s\"\n", output)
|
||||
if getOpts.Format == "mobi" {
|
||||
filename := book.ToMobi(c, getOpts.output)
|
||||
fmt.Printf("Ebook saved to \"%s\"\n", filename)
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
56
cmd/list.go
56
cmd/list.go
@@ -2,9 +2,11 @@ package cmd
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
urllib "net/url"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/jedib0t/go-pretty/v6/table"
|
||||
cobra "github.com/spf13/cobra"
|
||||
@@ -12,9 +14,42 @@ import (
|
||||
"github.com/lapwat/papeer/book"
|
||||
)
|
||||
|
||||
type ListOptions struct {
|
||||
// url string
|
||||
|
||||
Selector []string
|
||||
depth int
|
||||
limit int
|
||||
offset int
|
||||
delay int
|
||||
threads int
|
||||
// includeUrl bool
|
||||
include bool
|
||||
useLinkName bool
|
||||
}
|
||||
|
||||
var listOpts *ListOptions
|
||||
|
||||
func init() {
|
||||
listOpts = &ListOptions{}
|
||||
|
||||
listCmd.Flags().StringSliceVarP(&listOpts.Selector, "selector", "s", []string{}, "table of contents CSS selector")
|
||||
listCmd.Flags().IntVarP(&listOpts.depth, "depth", "d", 0, "scraping depth")
|
||||
listCmd.Flags().IntVarP(&listOpts.limit, "limit", "l", -1, "limit number of chapters, use with depth/selector")
|
||||
listCmd.Flags().IntVarP(&listOpts.offset, "offset", "o", 0, "skip first chapters, use with depth/selector")
|
||||
listCmd.Flags().IntVarP(&listOpts.delay, "delay", "", -1, "time in milliseconds to wait before downloading next chapter, use with depth/selector")
|
||||
listCmd.Flags().IntVarP(&listOpts.threads, "threads", "t", -1, "download concurrency, use with depth/selector")
|
||||
listCmd.Flags().BoolVarP(&listOpts.include, "include", "i", false, "include URL as first chapter, use with depth/selector")
|
||||
listCmd.Flags().BoolVarP(&listOpts.useLinkName, "use-link-name", "", false, "use link name for chapter title")
|
||||
|
||||
rootCmd.AddCommand(listCmd)
|
||||
}
|
||||
|
||||
var listCmd = &cobra.Command{
|
||||
Use: "ls",
|
||||
Short: "Print table of content",
|
||||
Use: "list URL",
|
||||
Aliases: []string{"ls"},
|
||||
Short: "Print URL table of contents",
|
||||
Example: "papeer list https://12factor.net/ -s 'section.concrete>article>h2>a'",
|
||||
Args: func(cmd *cobra.Command, args []string) error {
|
||||
if len(args) < 1 {
|
||||
return errors.New("requires an URL argument")
|
||||
@@ -22,12 +57,16 @@ var listCmd = &cobra.Command{
|
||||
return nil
|
||||
},
|
||||
Run: func(cmd *cobra.Command, args []string) {
|
||||
if len(listOpts.Selector) == 0 {
|
||||
listOpts.Selector = []string{""}
|
||||
}
|
||||
|
||||
base, err := urllib.Parse(args[0])
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
links, _, err := book.GetLinks(base, selector, limit, offset, include)
|
||||
links, path, _, err := book.GetLinks(base, listOpts.Selector[0], listOpts.limit, listOpts.offset, listOpts.include)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
@@ -37,7 +76,16 @@ var listCmd = &cobra.Command{
|
||||
t.Style().Options.DrawBorder = false
|
||||
t.Style().Options.SeparateColumns = false
|
||||
t.Style().Options.SeparateHeader = false
|
||||
t.AppendHeader(table.Row{"#", "Name", "Url"})
|
||||
|
||||
// format selector path
|
||||
pathArray := strings.Split(path, "<")
|
||||
// reverse path
|
||||
for i, j := 0, len(pathArray)-1; i < j; i, j = i+1, j-1 {
|
||||
pathArray[i], pathArray[j] = pathArray[j], pathArray[i]
|
||||
}
|
||||
pathFormatted := strings.Join(pathArray, ">")
|
||||
|
||||
t.AppendHeader(table.Row{"#", "Name", fmt.Sprintf("Url [%s]", pathFormatted)})
|
||||
|
||||
for index, link := range links {
|
||||
u, err := base.Parse(link.Href())
|
||||
|
||||
19
cmd/root.go
19
cmd/root.go
@@ -21,22 +21,3 @@ func Execute() {
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
func init() {
|
||||
rootCmd.PersistentFlags().StringVarP(&name, "name", "n", "", "book name (default: page title)")
|
||||
rootCmd.PersistentFlags().StringVarP(&author, "author", "a", "", "book author")
|
||||
rootCmd.PersistentFlags().StringVarP(&format, "format", "f", "stdout", "file format [stdout, md, epub, mobi]")
|
||||
rootCmd.PersistentFlags().StringVarP(&output, "output", "", "", "file name (default: book name)")
|
||||
rootCmd.PersistentFlags().StringVarP(&selector, "selector", "s", "", "table of content CSS selector, in resursive mode")
|
||||
rootCmd.PersistentFlags().BoolVarP(&recursive, "recursive", "r", false, "create one chapter per natigation item")
|
||||
rootCmd.PersistentFlags().BoolVarP(&include, "include", "i", false, "include URL as first chapter, in resursive mode")
|
||||
rootCmd.PersistentFlags().BoolVarP(&images, "images", "", false, "retrieve images only")
|
||||
rootCmd.PersistentFlags().BoolVarP(&quiet, "quiet", "q", false, "hide progress bar")
|
||||
rootCmd.PersistentFlags().IntVarP(&limit, "limit", "l", -1, "limit number of chapters, in recursive mode")
|
||||
rootCmd.PersistentFlags().IntVarP(&offset, "offset", "o", 0, "skip first chapters, in recursive mode")
|
||||
rootCmd.PersistentFlags().IntVarP(&delay, "delay", "d", -1, "time to wait before downloading next chapter, in milliseconds")
|
||||
rootCmd.PersistentFlags().IntVarP(&threads, "threads", "t", -1, "download concurrency, in recursive mode")
|
||||
|
||||
rootCmd.AddCommand(getCmd)
|
||||
rootCmd.AddCommand(listCmd)
|
||||
}
|
||||
|
||||
@@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{
|
||||
Use: "version",
|
||||
Short: "Print the version number of papeer",
|
||||
Run: func(cmd *cobra.Command, args []string) {
|
||||
fmt.Println("papeer v0.3.3")
|
||||
fmt.Println("papeer v0.4.0")
|
||||
},
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user