From d5971a2819780296869fbf2163c949fa156db071 Mon Sep 17 00:00:00 2001 From: lapwat Date: Sun, 10 Oct 2021 22:02:39 +0200 Subject: [PATCH] add threads option --- README.md | 3 ++- book/progress.go | 2 +- book/scraper.go | 17 ++++++++++++++--- cmd/get.go | 12 ++++++++++-- cmd/root.go | 1 + cmd/utils.go | 5 ----- 6 files changed, 28 insertions(+), 12 deletions(-) delete mode 100644 cmd/utils.go diff --git a/README.md b/README.md index 734998c..8e144ae 100644 --- a/README.md +++ b/README.md @@ -75,8 +75,9 @@ Flags: -o, --offset int skip first chapters, in recursive mode --output string output file -r, --recursive create one chapter per natigation item - -s, --selector string table of content CSS selector + -s, --selector string table of content CSS selector, in recursive mode --stdout print to standard output + -t, --threads int download concurrency, in recursive mode (default -1) Use "papeer [command] --help" for more information about a command. ``` diff --git a/book/progress.go b/book/progress.go index f642092..1f30ee6 100644 --- a/book/progress.go +++ b/book/progress.go @@ -19,8 +19,8 @@ func NewProgress(links []link) progress { return fmt.Sprintf("Chapters %d / %d", b.Current(), len(links)) }) - individuals := []*uiprogress.Bar{} // hide individual bars if more than 50 chapters + individuals := []*uiprogress.Bar{} if len(links) <= 50 { for index, link := range links { bar := uiprogress.AddBar(1) diff --git a/book/scraper.go b/book/scraper.go index 6791e50..0bd74ed 100644 --- a/book/scraper.go +++ b/book/scraper.go @@ -14,9 +14,9 @@ import ( colly "github.com/gocolly/colly/v2" ) -func NewBookFromURL(url, selector string, recursive, include, images bool, limit, offset, delay int) book { +func NewBookFromURL(url, selector string, recursive, include, images bool, limit, offset, delay, threads int) book { if recursive { - chapters := tableOfContent(url, selector, limit, offset, delay, include, images) + chapters := tableOfContent(url, selector, limit, offset, delay, 
threads, include, images) b := New(chapters[0].Name(), chapters[0].Author()) for _, c := range chapters { @@ -56,7 +56,7 @@ func NewChapterFromURL(url string, images bool) chapter { return chapter{article.Title, article.Byline, content} } -func tableOfContent(url, selector string, limit, offset, delay int, include, images bool) []chapter { +func tableOfContent(url, selector string, limit, offset, delay, threads int, include, images bool) []chapter { base, err := urllib.Parse(url) if err != nil { log.Fatal(err) @@ -71,6 +71,7 @@ func tableOfContent(url, selector string, limit, offset, delay int, include, ima progress := NewProgress(links) if delay >= 0 { + // synchronous mode for index, link := range links { // and then use it to parse relative URLs @@ -91,10 +92,19 @@ func tableOfContent(url, selector string, limit, offset, delay int, include, ima } } else { + // asynchronous mode var wg sync.WaitGroup + + if threads == -1 { + threads = len(links) + } + semaphore := make(chan bool, threads) + for index, l := range links { wg.Add(1) + semaphore <- true + go func(index int, l link) { defer wg.Done() @@ -107,6 +117,7 @@ func tableOfContent(url, selector string, limit, offset, delay int, include, ima chapters[index] = NewChapterFromURL(u.String(), images) progress.Incr(index) + <-semaphore }(index, l) } wg.Wait() diff --git a/cmd/get.go b/cmd/get.go index dd8e54e..1772899 100644 --- a/cmd/get.go +++ b/cmd/get.go @@ -17,7 +17,7 @@ import ( var stdout, recursive, include, images bool var format, output, selector string -var limit, offset, delay int +var limit, offset, delay, threads int var getCmd = &cobra.Command{ Use: "get", @@ -68,11 +68,19 @@ var getCmd = &cobra.Command{ return errors.New("cannot use delay option if not in recursive mode") } + if cmd.Flags().Changed("threads") && recursive == false { + return errors.New("cannot use threads option if not in recursive mode") + } + + if cmd.Flags().Changed("delay") && cmd.Flags().Changed("threads") { + return 
errors.New("cannot use delay and threads options at the same time") + } + return nil }, Run: func(cmd *cobra.Command, args []string) { url := args[0] - b := book.NewBookFromURL(url, selector, recursive, include, images, limit, offset, delay) + b := book.NewBookFromURL(url, selector, recursive, include, images, limit, offset, delay, threads) if len(output) == 0 { // set default output diff --git a/cmd/root.go b/cmd/root.go index 251f211..eba67e2 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -33,6 +33,7 @@ func init() { rootCmd.PersistentFlags().IntVarP(&limit, "limit", "l", -1, "limit number of chapters, in recursive mode") rootCmd.PersistentFlags().IntVarP(&offset, "offset", "o", 0, "skip first chapters, in recursive mode") rootCmd.PersistentFlags().IntVarP(&delay, "delay", "d", -1, "time to wait before downloading next chapter, in milliseconds") + rootCmd.PersistentFlags().IntVarP(&threads, "threads", "t", -1, "download concurrency, in recursive mode") rootCmd.AddCommand(getCmd) rootCmd.AddCommand(listCmd) diff --git a/cmd/utils.go b/cmd/utils.go deleted file mode 100644 index 5411e9e..0000000 --- a/cmd/utils.go +++ /dev/null @@ -1,5 +0,0 @@ -package cmd - -func getTableOfContent() { - -}