mirror of
https://github.com/NohamR/papeer.git
synced 2026-05-25 12:27:20 +00:00
Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
be69854b17 |
@@ -48,6 +48,7 @@ Flags:
|
|||||||
-o, --offset int skip first chapters, use with depth/selector
|
-o, --offset int skip first chapters, use with depth/selector
|
||||||
--output string file name (default: book name)
|
--output string file name (default: book name)
|
||||||
-q, --quiet hide progress bar
|
-q, --quiet hide progress bar
|
||||||
|
-r, --reverse reverse chapter order
|
||||||
-s, --selector strings table of contents CSS selector
|
-s, --selector strings table of contents CSS selector
|
||||||
-t, --threads int download concurrency, use with depth/selector (default -1)
|
-t, --threads int download concurrency, use with depth/selector (default -1)
|
||||||
--use-link-name use link name for chapter title
|
--use-link-name use link name for chapter title
|
||||||
@@ -77,7 +78,7 @@ You can chain this option to grab several levels of pages with different selectors
|
|||||||
|
|
||||||
### Display the table of contents
|
### Display the table of contents
|
||||||
|
|
||||||
Before actually scraping a whole website, it is a good idea to use the `list` command. This command is like a **dry run**, which lets you visualize the content before actually retrieving it. You can use several options to customize the table of contents extraction, such as `selector`, `limit`, `offset` and `include`. Type `papeer list --help` for more information about those options.
|
Before actually scraping a whole website, it is a good idea to use the `list` command. This command is like a **dry run**, which lets you visualize the content before actually retrieving it. You can use several options to customize the table of contents extraction, such as `selector`, `limit`, `offset`, `reverse` and `include`. Type `papeer list --help` for more information about those options.
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
papeer list https://12factor.net/ -s 'section.concrete>article>h2>a'
|
papeer list https://12factor.net/ -s 'section.concrete>article>h2>a'
|
||||||
@@ -137,7 +138,7 @@ go get -u github.com/lapwat/papeer
|
|||||||
```sh
|
```sh
|
||||||
# use platform=darwin for MacOS
|
# use platform=darwin for MacOS
|
||||||
platform=linux
|
platform=linux
|
||||||
release=0.4.1
|
release=0.4.2
|
||||||
|
|
||||||
# download and extract
|
# download and extract
|
||||||
curl -L https://github.com/lapwat/papeer/releases/download/v$release/papeer-v$release-$platform-amd64.tar.gz > papeer.tar.gz
|
curl -L https://github.com/lapwat/papeer/releases/download/v$release/papeer-v$release-$platform-amd64.tar.gz > papeer.tar.gz
|
||||||
@@ -150,7 +151,7 @@ sudo mv papeer /usr/local/bin
|
|||||||
|
|
||||||
### Windows
|
### Windows
|
||||||
|
|
||||||
Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.4.1/papeer-v0.4.1-windows-amd64.exe.zip).
|
Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.4.2/papeer-v0.4.2-windows-amd64.exe.zip).
|
||||||
|
|
||||||
## MOBI support
|
## MOBI support
|
||||||
|
|
||||||
|
|||||||
@@ -23,6 +23,7 @@ type ScrapeConfig struct {
|
|||||||
Quiet bool
|
Quiet bool
|
||||||
Limit int
|
Limit int
|
||||||
Offset int
|
Offset int
|
||||||
|
Reverse bool
|
||||||
Delay int
|
Delay int
|
||||||
Threads int
|
Threads int
|
||||||
Include bool
|
Include bool
|
||||||
@@ -31,7 +32,7 @@ type ScrapeConfig struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func NewScrapeConfig() *ScrapeConfig {
|
func NewScrapeConfig() *ScrapeConfig {
|
||||||
return &ScrapeConfig{0, "", false, -1, 0, -1, -1, true, false, false}
|
return &ScrapeConfig{0, "", false, -1, 0, false, -1, -1, true, false, false}
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewScrapeConfigs(selectors []string) []*ScrapeConfig {
|
func NewScrapeConfigs(selectors []string) []*ScrapeConfig {
|
||||||
@@ -167,21 +168,24 @@ func NewChapterFromURL(url, linkName string, configs []*ScrapeConfig, index int,
|
|||||||
updateProgressBarName(index, name)
|
updateProgressBarName(index, name)
|
||||||
}
|
}
|
||||||
|
|
||||||
subchapters := []chapter{}
|
var subchapters []chapter
|
||||||
if len(configs) > 1 {
|
if len(configs) > 1 {
|
||||||
// add subchapters
|
|
||||||
|
|
||||||
links, _, _, err := GetLinks(base, config.Selector, config.Limit, config.Offset, false)
|
// retrieve links on page
|
||||||
|
links, _, _, err := GetLinks(base, config.Selector, config.Limit, config.Offset, config.Reverse, false)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatal(err)
|
log.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
subchapters = make([]chapter, len(links))
|
// init progess bar
|
||||||
var p progress
|
var p progress
|
||||||
if config.Quiet == false {
|
if config.Quiet == false {
|
||||||
p = NewProgress(links, name, config.Depth)
|
p = NewProgress(links, name, config.Depth)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// init chapters list
|
||||||
|
subchapters = make([]chapter, len(links))
|
||||||
|
|
||||||
if config.Delay >= 0 {
|
if config.Delay >= 0 {
|
||||||
|
|
||||||
// synchronous mode
|
// synchronous mode
|
||||||
@@ -277,7 +281,7 @@ func tableOfContent(url string, config *ScrapeConfig, subConfig *ScrapeConfig, q
|
|||||||
log.Fatal(err)
|
log.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
links, _, home, err := GetLinks(base, config.Selector, config.Limit, config.Offset, config.Include)
|
links, _, home, err := GetLinks(base, config.Selector, config.Limit, config.Offset, config.Reverse, config.Include)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatal(err)
|
log.Fatal(err)
|
||||||
}
|
}
|
||||||
@@ -370,7 +374,7 @@ func GetPath(elm *goquery.Selection) string {
|
|||||||
return join
|
return join
|
||||||
}
|
}
|
||||||
|
|
||||||
func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool) ([]link, string, chapter, error) {
|
func GetLinks(url *urllib.URL, selector string, limit, offset int, reverse, include bool) ([]link, string, chapter, error) {
|
||||||
selectorSet := true
|
selectorSet := true
|
||||||
if len(selector) == 0 {
|
if len(selector) == 0 {
|
||||||
selector = "a"
|
selector = "a"
|
||||||
@@ -434,10 +438,18 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool)
|
|||||||
|
|
||||||
home := NewChapterFromURL(url.String(), "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
|
home := NewChapterFromURL(url.String(), "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
|
||||||
|
|
||||||
|
// include home page
|
||||||
if include {
|
if include {
|
||||||
l := NewLink(url.String(), home.Name())
|
l := NewLink(url.String(), home.Name())
|
||||||
links = append([]link{l}, links...)
|
links = append([]link{l}, links...)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// reverse links
|
||||||
|
if reverse {
|
||||||
|
for i, j := 0, len(links)-1; i < j; i, j = i+1, j-1 {
|
||||||
|
links[i], links[j] = links[j], links[i]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return links, pathMax, home, nil
|
return links, pathMax, home, nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -182,6 +182,24 @@ func TestSubChaptersLimitOver(t *testing.T) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestReverse(t *testing.T) {
|
||||||
|
|
||||||
|
config0 := NewScrapeConfig()
|
||||||
|
config0.Reverse = true
|
||||||
|
|
||||||
|
config1 := NewScrapeConfig()
|
||||||
|
|
||||||
|
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {})
|
||||||
|
|
||||||
|
got := c.SubChapters()[0].Name()
|
||||||
|
want := "The Twelve-Factor App"
|
||||||
|
|
||||||
|
if got != want {
|
||||||
|
t.Errorf("got %v, wanted %v", got, want)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
func TestNotInclude(t *testing.T) {
|
func TestNotInclude(t *testing.T) {
|
||||||
|
|
||||||
config := NewScrapeConfig()
|
config := NewScrapeConfig()
|
||||||
|
|||||||
@@ -25,6 +25,7 @@ type GetOptions struct {
|
|||||||
depth int
|
depth int
|
||||||
limit int
|
limit int
|
||||||
offset int
|
offset int
|
||||||
|
reverse bool
|
||||||
delay int
|
delay int
|
||||||
threads int
|
threads int
|
||||||
// includeUrl bool
|
// includeUrl bool
|
||||||
@@ -49,6 +50,7 @@ func init() {
|
|||||||
getCmd.Flags().IntVarP(&getOpts.depth, "depth", "d", 0, "scraping depth")
|
getCmd.Flags().IntVarP(&getOpts.depth, "depth", "d", 0, "scraping depth")
|
||||||
getCmd.Flags().IntVarP(&getOpts.limit, "limit", "l", -1, "limit number of chapters, use with depth/selector")
|
getCmd.Flags().IntVarP(&getOpts.limit, "limit", "l", -1, "limit number of chapters, use with depth/selector")
|
||||||
getCmd.Flags().IntVarP(&getOpts.offset, "offset", "o", 0, "skip first chapters, use with depth/selector")
|
getCmd.Flags().IntVarP(&getOpts.offset, "offset", "o", 0, "skip first chapters, use with depth/selector")
|
||||||
|
getCmd.Flags().BoolVarP(&getOpts.reverse, "reverse", "r", false, "reverse chapter order")
|
||||||
getCmd.Flags().IntVarP(&getOpts.delay, "delay", "", -1, "time in milliseconds to wait before downloading next chapter, use with depth/selector")
|
getCmd.Flags().IntVarP(&getOpts.delay, "delay", "", -1, "time in milliseconds to wait before downloading next chapter, use with depth/selector")
|
||||||
getCmd.Flags().IntVarP(&getOpts.threads, "threads", "t", -1, "download concurrency, use with depth/selector")
|
getCmd.Flags().IntVarP(&getOpts.threads, "threads", "t", -1, "download concurrency, use with depth/selector")
|
||||||
getCmd.Flags().BoolVarP(&getOpts.include, "include", "i", false, "include URL as first chapter, use with depth/selector")
|
getCmd.Flags().BoolVarP(&getOpts.include, "include", "i", false, "include URL as first chapter, use with depth/selector")
|
||||||
@@ -96,6 +98,10 @@ var getCmd = &cobra.Command{
|
|||||||
return errors.New("cannot use offset option if depth/selector is not specified")
|
return errors.New("cannot use offset option if depth/selector is not specified")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if cmd.Flags().Changed("reverse") && getOpts.depth == 0 && len(getOpts.Selector) == 0 {
|
||||||
|
return errors.New("cannot use reverse option if depth/selector is not specified")
|
||||||
|
}
|
||||||
|
|
||||||
if cmd.Flags().Changed("delay") && getOpts.depth == 0 && len(getOpts.Selector) == 0 {
|
if cmd.Flags().Changed("delay") && getOpts.depth == 0 && len(getOpts.Selector) == 0 {
|
||||||
return errors.New("cannot use delay option if depth/selector is not specified")
|
return errors.New("cannot use delay option if depth/selector is not specified")
|
||||||
}
|
}
|
||||||
@@ -122,7 +128,6 @@ var getCmd = &cobra.Command{
|
|||||||
for len(getOpts.Selector) < getOpts.depth+1 {
|
for len(getOpts.Selector) < getOpts.depth+1 {
|
||||||
getOpts.Selector = append(getOpts.Selector, "")
|
getOpts.Selector = append(getOpts.Selector, "")
|
||||||
}
|
}
|
||||||
fmt.Println(len(getOpts.Selector))
|
|
||||||
|
|
||||||
// generate config for each level
|
// generate config for each level
|
||||||
configs := make([]*book.ScrapeConfig, len(getOpts.Selector))
|
configs := make([]*book.ScrapeConfig, len(getOpts.Selector))
|
||||||
@@ -132,6 +137,7 @@ var getCmd = &cobra.Command{
|
|||||||
config.Quiet = getOpts.quiet
|
config.Quiet = getOpts.quiet
|
||||||
config.Limit = getOpts.limit
|
config.Limit = getOpts.limit
|
||||||
config.Offset = getOpts.offset
|
config.Offset = getOpts.offset
|
||||||
|
config.Reverse = getOpts.reverse
|
||||||
config.Delay = getOpts.delay
|
config.Delay = getOpts.delay
|
||||||
config.Threads = getOpts.threads
|
config.Threads = getOpts.threads
|
||||||
config.ImagesOnly = getOpts.images
|
config.ImagesOnly = getOpts.images
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ type ListOptions struct {
|
|||||||
depth int
|
depth int
|
||||||
limit int
|
limit int
|
||||||
offset int
|
offset int
|
||||||
|
reverse bool
|
||||||
delay int
|
delay int
|
||||||
threads int
|
threads int
|
||||||
// includeUrl bool
|
// includeUrl bool
|
||||||
@@ -33,10 +34,12 @@ var listOpts *ListOptions
|
|||||||
func init() {
|
func init() {
|
||||||
listOpts = &ListOptions{}
|
listOpts = &ListOptions{}
|
||||||
|
|
||||||
|
// common with get command
|
||||||
listCmd.Flags().StringSliceVarP(&listOpts.Selector, "selector", "s", []string{}, "table of contents CSS selector")
|
listCmd.Flags().StringSliceVarP(&listOpts.Selector, "selector", "s", []string{}, "table of contents CSS selector")
|
||||||
listCmd.Flags().IntVarP(&listOpts.depth, "depth", "d", 0, "scraping depth")
|
listCmd.Flags().IntVarP(&listOpts.depth, "depth", "d", 0, "scraping depth")
|
||||||
listCmd.Flags().IntVarP(&listOpts.limit, "limit", "l", -1, "limit number of chapters, use with depth/selector")
|
listCmd.Flags().IntVarP(&listOpts.limit, "limit", "l", -1, "limit number of chapters, use with depth/selector")
|
||||||
listCmd.Flags().IntVarP(&listOpts.offset, "offset", "o", 0, "skip first chapters, use with depth/selector")
|
listCmd.Flags().IntVarP(&listOpts.offset, "offset", "o", 0, "skip first chapters, use with depth/selector")
|
||||||
|
listCmd.Flags().BoolVarP(&listOpts.reverse, "reverse", "r", false, "reverse chapter order")
|
||||||
listCmd.Flags().IntVarP(&listOpts.delay, "delay", "", -1, "time in milliseconds to wait before downloading next chapter, use with depth/selector")
|
listCmd.Flags().IntVarP(&listOpts.delay, "delay", "", -1, "time in milliseconds to wait before downloading next chapter, use with depth/selector")
|
||||||
listCmd.Flags().IntVarP(&listOpts.threads, "threads", "t", -1, "download concurrency, use with depth/selector")
|
listCmd.Flags().IntVarP(&listOpts.threads, "threads", "t", -1, "download concurrency, use with depth/selector")
|
||||||
listCmd.Flags().BoolVarP(&listOpts.include, "include", "i", false, "include URL as first chapter, use with depth/selector")
|
listCmd.Flags().BoolVarP(&listOpts.include, "include", "i", false, "include URL as first chapter, use with depth/selector")
|
||||||
@@ -66,7 +69,7 @@ var listCmd = &cobra.Command{
|
|||||||
log.Fatal(err)
|
log.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
links, path, _, err := book.GetLinks(base, listOpts.Selector[0], listOpts.limit, listOpts.offset, listOpts.include)
|
links, path, _, err := book.GetLinks(base, listOpts.Selector[0], listOpts.limit, listOpts.offset, listOpts.reverse, listOpts.include)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatal(err)
|
log.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{
|
|||||||
Use: "version",
|
Use: "version",
|
||||||
Short: "Print the version number of papeer",
|
Short: "Print the version number of papeer",
|
||||||
Run: func(cmd *cobra.Command, args []string) {
|
Run: func(cmd *cobra.Command, args []string) {
|
||||||
fmt.Println("papeer v0.4.1")
|
fmt.Println("papeer v0.4.2")
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|||||||
11
release.sh
11
release.sh
@@ -1,14 +1,14 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
version=$1
|
|
||||||
platforms=("linux/amd64" "darwin/amd64" "windows/amd64")
|
|
||||||
|
|
||||||
if [ "$#" -ne 1 ]; then
|
if [ "$#" -ne 1 ]; then
|
||||||
echo "Illegal number of parameters"
|
echo "Illegal number of parameters"
|
||||||
echo "Usage: ./release.sh X.X.X"
|
echo "Usage: ./release.sh X.X.X"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
version=$1
|
||||||
|
platforms=("linux/amd64" "darwin/amd64" "windows/amd64")
|
||||||
|
|
||||||
for platform in "${platforms[@]}"
|
for platform in "${platforms[@]}"
|
||||||
do
|
do
|
||||||
platform_split=(${platform//\// })
|
platform_split=(${platform//\// })
|
||||||
@@ -25,9 +25,4 @@ do
|
|||||||
tar czvf "$output_name-v$version-$GOOS-$GOARCH.tar.gz" "$output_name"
|
tar czvf "$output_name-v$version-$GOOS-$GOARCH.tar.gz" "$output_name"
|
||||||
rm "$output_name"
|
rm "$output_name"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# if [ $? -ne 0 ]; then
|
|
||||||
# echo 'An error has occurred! Aborting the script execution...'
|
|
||||||
# exit 1
|
|
||||||
# fi
|
|
||||||
done
|
done
|
||||||
|
|||||||
Reference in New Issue
Block a user