diff --git a/README.md b/README.md index 716292e..67295cd 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,7 @@ Flags: -o, --offset int skip first chapters, use with depth/selector --output string file name (default: book name) -q, --quiet hide progress bar + -r, --reverse reverse chapter order -s, --selector strings table of contents CSS selector -t, --threads int download concurrency, use with depth/selector (default -1) --use-link-name use link name for chapter title @@ -77,7 +78,7 @@ You can chain this option to grab several level of pages with diferent selectors ### Display the table of contents -Before actually scraping a whole website, it is a good idea to use the `list` command. This command is like a **dry run**, which lets you vizualize the content before actually retrieving it. You can use several options to customize the table of contents extraction, such as `selector`, `limit`, `offset` and `include`. Type `papeer list --help` for more information about those options. +Before actually scraping a whole website, it is a good idea to use the `list` command. This command is like a **dry run**, which lets you visualize the content before actually retrieving it. You can use several options to customize the table of contents extraction, such as `selector`, `limit`, `offset`, `reverse` and `include`. Type `papeer list --help` for more information about those options. ```sh papeer list https://12factor.net/ -s 'section.concrete>article>h2>a' @@ -137,7 +138,7 @@ go get -u github.com/lapwat/papeer ```sh # use platform=darwin for MacOS platform=linux -release=0.4.1 +release=0.4.2 # download and extract curl -L https://github.com/lapwat/papeer/releases/download/v$release/papeer-v$release-$platform-amd64.tar.gz > papeer.tar.gz @@ -150,7 +151,7 @@ sudo mv papeer /usr/local/bin ### Windows -Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.4.1/papeer-v0.4.1-windows-amd64.exe.zip). 
+Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.4.2/papeer-v0.4.2-windows-amd64.exe.zip). ## MOBI support diff --git a/book/scraper.go b/book/scraper.go index 041640f..4b2e910 100644 --- a/book/scraper.go +++ b/book/scraper.go @@ -23,6 +23,7 @@ type ScrapeConfig struct { Quiet bool Limit int Offset int + Reverse bool Delay int Threads int Include bool @@ -31,7 +32,7 @@ type ScrapeConfig struct { } func NewScrapeConfig() *ScrapeConfig { - return &ScrapeConfig{0, "", false, -1, 0, -1, -1, true, false, false} + return &ScrapeConfig{0, "", false, -1, 0, false, -1, -1, true, false, false} } func NewScrapeConfigs(selectors []string) []*ScrapeConfig { @@ -167,21 +168,24 @@ func NewChapterFromURL(url, linkName string, configs []*ScrapeConfig, index int, updateProgressBarName(index, name) } - subchapters := []chapter{} + var subchapters []chapter if len(configs) > 1 { - // add subchapters - links, _, _, err := GetLinks(base, config.Selector, config.Limit, config.Offset, false) + // retrieve links on page + links, _, _, err := GetLinks(base, config.Selector, config.Limit, config.Offset, config.Reverse, false) if err != nil { log.Fatal(err) } - subchapters = make([]chapter, len(links)) + // init progress bar var p progress if config.Quiet == false { p = NewProgress(links, name, config.Depth) } + // init chapters list + subchapters = make([]chapter, len(links)) + if config.Delay >= 0 { // synchronous mode @@ -277,7 +281,7 @@ func tableOfContent(url string, config *ScrapeConfig, subConfig *ScrapeConfig, q log.Fatal(err) } - links, _, home, err := GetLinks(base, config.Selector, config.Limit, config.Offset, config.Include) + links, _, home, err := GetLinks(base, config.Selector, config.Limit, config.Offset, config.Reverse, config.Include) if err != nil { log.Fatal(err) } @@ -370,7 +374,7 @@ func GetPath(elm *goquery.Selection) string { return join } -func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool) ([]link, string, 
chapter, error) { +func GetLinks(url *urllib.URL, selector string, limit, offset int, reverse, include bool) ([]link, string, chapter, error) { selectorSet := true if len(selector) == 0 { selector = "a" @@ -434,10 +438,18 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool) home := NewChapterFromURL(url.String(), "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {}) + // include home page if include { l := NewLink(url.String(), home.Name()) links = append([]link{l}, links...) } + // reverse links + if reverse { + for i, j := 0, len(links)-1; i < j; i, j = i+1, j-1 { + links[i], links[j] = links[j], links[i] + } + } + return links, pathMax, home, nil } diff --git a/book/scraper_test.go b/book/scraper_test.go index f013161..79c8ce9 100644 --- a/book/scraper_test.go +++ b/book/scraper_test.go @@ -182,6 +182,24 @@ func TestSubChaptersLimitOver(t *testing.T) { } +func TestReverse(t *testing.T) { + + config0 := NewScrapeConfig() + config0.Reverse = true + + config1 := NewScrapeConfig() + + c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {}) + + got := c.SubChapters()[0].Name() + want := "The Twelve-Factor App" + + if got != want { + t.Errorf("got %v, wanted %v", got, want) + } + +} + func TestNotInclude(t *testing.T) { config := NewScrapeConfig() diff --git a/cmd/get.go b/cmd/get.go index 57662da..5cfe6d8 100644 --- a/cmd/get.go +++ b/cmd/get.go @@ -25,6 +25,7 @@ type GetOptions struct { depth int limit int offset int + reverse bool delay int threads int // includeUrl bool @@ -49,6 +50,7 @@ func init() { getCmd.Flags().IntVarP(&getOpts.depth, "depth", "d", 0, "scraping depth") getCmd.Flags().IntVarP(&getOpts.limit, "limit", "l", -1, "limit number of chapters, use with depth/selector") getCmd.Flags().IntVarP(&getOpts.offset, "offset", "o", 0, "skip first chapters, use with depth/selector") + getCmd.Flags().BoolVarP(&getOpts.reverse, "reverse", "r", 
false, "reverse chapter order") getCmd.Flags().IntVarP(&getOpts.delay, "delay", "", -1, "time in milliseconds to wait before downloading next chapter, use with depth/selector") getCmd.Flags().IntVarP(&getOpts.threads, "threads", "t", -1, "download concurrency, use with depth/selector") getCmd.Flags().BoolVarP(&getOpts.include, "include", "i", false, "include URL as first chapter, use with depth/selector") @@ -96,6 +98,10 @@ var getCmd = &cobra.Command{ return errors.New("cannot use offset option if depth/selector is not specified") } + if cmd.Flags().Changed("reverse") && getOpts.depth == 0 && len(getOpts.Selector) == 0 { + return errors.New("cannot use reverse option if depth/selector is not specified") + } + if cmd.Flags().Changed("delay") && getOpts.depth == 0 && len(getOpts.Selector) == 0 { return errors.New("cannot use delay option if depth/selector is not specified") } @@ -122,7 +128,6 @@ var getCmd = &cobra.Command{ for len(getOpts.Selector) < getOpts.depth+1 { getOpts.Selector = append(getOpts.Selector, "") } - fmt.Println(len(getOpts.Selector)) // generate config for each level configs := make([]*book.ScrapeConfig, len(getOpts.Selector)) @@ -132,6 +137,7 @@ var getCmd = &cobra.Command{ config.Quiet = getOpts.quiet config.Limit = getOpts.limit config.Offset = getOpts.offset + config.Reverse = getOpts.reverse config.Delay = getOpts.delay config.Threads = getOpts.threads config.ImagesOnly = getOpts.images diff --git a/cmd/list.go b/cmd/list.go index 867e75a..fc4430e 100644 --- a/cmd/list.go +++ b/cmd/list.go @@ -21,6 +21,7 @@ type ListOptions struct { depth int limit int offset int + reverse bool delay int threads int // includeUrl bool @@ -33,10 +34,12 @@ var listOpts *ListOptions func init() { listOpts = &ListOptions{} + // common with get command listCmd.Flags().StringSliceVarP(&listOpts.Selector, "selector", "s", []string{}, "table of contents CSS selector") listCmd.Flags().IntVarP(&listOpts.depth, "depth", "d", 0, "scraping depth") 
listCmd.Flags().IntVarP(&listOpts.limit, "limit", "l", -1, "limit number of chapters, use with depth/selector") listCmd.Flags().IntVarP(&listOpts.offset, "offset", "o", 0, "skip first chapters, use with depth/selector") + listCmd.Flags().BoolVarP(&listOpts.reverse, "reverse", "r", false, "reverse chapter order") listCmd.Flags().IntVarP(&listOpts.delay, "delay", "", -1, "time in milliseconds to wait before downloading next chapter, use with depth/selector") listCmd.Flags().IntVarP(&listOpts.threads, "threads", "t", -1, "download concurrency, use with depth/selector") listCmd.Flags().BoolVarP(&listOpts.include, "include", "i", false, "include URL as first chapter, use with depth/selector") @@ -66,7 +69,7 @@ var listCmd = &cobra.Command{ log.Fatal(err) } - links, path, _, err := book.GetLinks(base, listOpts.Selector[0], listOpts.limit, listOpts.offset, listOpts.include) + links, path, _, err := book.GetLinks(base, listOpts.Selector[0], listOpts.limit, listOpts.offset, listOpts.reverse, listOpts.include) if err != nil { log.Fatal(err) } diff --git a/cmd/version.go b/cmd/version.go index a2f286c..650c6b5 100644 --- a/cmd/version.go +++ b/cmd/version.go @@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{ Use: "version", Short: "Print the version number of papeer", Run: func(cmd *cobra.Command, args []string) { - fmt.Println("papeer v0.4.1") + fmt.Println("papeer v0.4.2") }, } diff --git a/release.sh b/release.sh index 8bc9182..031f324 100755 --- a/release.sh +++ b/release.sh @@ -1,14 +1,14 @@ #!/usr/bin/env bash -version=$1 -platforms=("linux/amd64" "darwin/amd64" "windows/amd64") - if [ "$#" -ne 1 ]; then echo "Illegal number of parameters" echo "Usage: ./release.sh X.X.X" exit 1 fi +version=$1 +platforms=("linux/amd64" "darwin/amd64" "windows/amd64") + for platform in "${platforms[@]}" do platform_split=(${platform//\// }) @@ -25,9 +25,4 @@ do tar czvf "$output_name-v$version-$GOOS-$GOARCH.tar.gz" "$output_name" rm "$output_name" fi - - # if [ $? 
-ne 0 ]; then - # echo 'An error has occurred! Aborting the script execution...' - # exit 1 - # fi done