diff --git a/Makefile b/Makefile index 1574cc5..4abac44 100644 --- a/Makefile +++ b/Makefile @@ -8,3 +8,4 @@ clean: find . -maxdepth 1 -not -name 'README.md' -name '*.md' -delete find . -maxdepth 1 -name '*.epub' -delete find . -maxdepth 1 -name '*.mobi' -delete + find . -maxdepth 1 -name 'papeer-v*' -delete diff --git a/README.md b/README.md index 4bc230f..5a0d812 100644 --- a/README.md +++ b/README.md @@ -59,13 +59,20 @@ The `recursive` option lets you extract the table of content of a website, then Before trying the `recursive` option, it is a good idea to use the `ls` option, which lets you vizualize the content that will be retrieved. You can use several options to customize the table of content extraction, such as `selector`, `limit`, `offset` and `include`. Type `papeer help` for more information about those options. ```sh -papeer ls https://news.ycombinator.com/ --limit=5 -# # NAME URL -# 1 Tailwind CSS v3.0 https://tailwindcss.com/blog/tailwindcss-v3 -# 2 A molten salt storage solution using sodium hydroxide https://sifted.eu/articles/salt-energy-storage-seaborg-hyme/ -# 3 HashiCorp IPO today https://www.hashicorp.com/blog/a-new-chapter-for-hashicorp -# 4 Stack Graphs https://github.blog/2021-12-09-introducing-stack-graphs/ -# 5 ‘Tipping point’ makes partisan polarization irreversible https://news.cornell.edu/stories/2021/12/tipping-point-makes-partisan-polarization-irreversible +papeer ls https://12factor.net/ -s 'section.concrete > article > h2 > a' +# # NAME URL +# 1 I. Codebase https://12factor.net/codebase +# 2 II. Dependencies https://12factor.net/dependencies +# 3 III. Config https://12factor.net/config +# 4 IV. Backing services https://12factor.net/backing-services +# 5 V. Build, release, run https://12factor.net/build-release-run +# 6 VI. Processes https://12factor.net/processes +# 7 VII. Port binding https://12factor.net/port-binding +# 8 VIII. Concurrency https://12factor.net/concurrency +# 9 IX. Disposability https://12factor.net/disposability +# 10 X. Dev/prod parity https://12factor.net/dev-prod-parity +# 11 XI. Logs https://12factor.net/logs +# 12 XII. Admin processes https://12factor.net/admin-processes ``` ### Scrape time @@ -73,14 +80,21 @@ papeer ls https://news.ycombinator.com/ --limit=5 Once you are satisfied with the table of content listed by the `ls` command, you can actually scrape the content of those pages. You can use the same options that you specified for the `ls` command. In recursive mode, you also have the possibility to use `delay` and `threads` options. ```sh -papeer get https://news.ycombinator.com/ --recursive --delay=500 --limit=5 --format=md -# [========================================>---------------------------] Chapters 3 / 5 -# [====================================================================] 1. Tailwind CSS v3.0 -# [====================================================================] 2. A molten salt storage solution using sodium hydroxide -# [====================================================================] 3. HashiCorp IPO today -# [--------------------------------------------------------------------] 4. Stack Graphs -# [--------------------------------------------------------------------] 5. ‘Tipping point’ makes partisan polarization irreversible -# Markdown saved to "Hacker News.md" +papeer get https://12factor.net/ --recursive -s 'section.concrete > article > h2 > a' --format=md +# [======================================>-----------------------------] Chapters 7 / 12 +# [====================================================================] 1. I. Codebase +# [====================================================================] 2. II. Dependencies +# [====================================================================] 3. III. Config +# [====================================================================] 4. IV. Backing services +# [====================================================================] 5. V. Build, release, run +# [====================================================================] 6. VI. Processes +# [====================================================================] 7. VII. Port binding +# [--------------------------------------------------------------------] 8. VIII. Concurrency +# [--------------------------------------------------------------------] 9. IX. Disposability +# [--------------------------------------------------------------------] 10. X. Dev/prod parity +# [--------------------------------------------------------------------] 11. XI. Logs +# [--------------------------------------------------------------------] 12. XII. Admin processes +# Markdown saved to "The_Twelve-Factor_App.md" ``` # Installation diff --git a/book/chapter.go b/book/chapter.go index 841b7a4..b9fb2a7 100644 --- a/book/chapter.go +++ b/book/chapter.go @@ -1,13 +1,20 @@ package book type chapter struct { - name string - author string - content string + body string + name string + author string + content string + subChapters []chapter + config *ScrapeConfig } -func NewChapter(name, author, content string) chapter { - return chapter{name, author, content} +func NewChapter(body, name, author, content string, subChapters []chapter, config *ScrapeConfig) chapter { + return chapter{body, name, author, content, subChapters, config} +} + +func (c chapter) Body() string { + return c.body } func (c chapter) Name() string { @@ -21,3 +28,7 @@ func (c chapter) Author() string { func (c chapter) Content() string { return c.content } + +func (c chapter) SubChapters() []chapter { + return c.subChapters +} diff --git a/book/format.go b/book/format.go new file mode 100644 index 0000000..6a8d0df --- /dev/null +++ b/book/format.go @@ -0,0 +1,90 @@ +package book + +import ( + "fmt" + "log" + "strings" + + md "github.com/JohannesKaufmann/html-to-markdown" + "github.com/PuerkitoBio/goquery" + epub "github.com/bmaupin/go-epub" +) + +func ToMarkdown(c chapter) string { + + // make title + underline := strings.Repeat("=", len(c.Name())) + title := fmt.Sprintf("%s\n%s", c.Name(), underline) + + // convert content to markdown + content, err := md.NewConverter("", true, nil).ConvertString(c.Content()) + if err != nil { + log.Fatal(err) + } + + // merge title and content + content = fmt.Sprintf("%s\n\n%s", title, content) + + for _, sc := range c.SubChapters() { + // merge subchapters + content = fmt.Sprintf("%s\n\n\n%s", content, ToMarkdown(sc)) + } + + return content +} + +func ToEpub(c chapter, filename string) { + if len(filename) == 0 { + filename = fmt.Sprintf("%s.epub", c.Name()) + } + + // init ebook + e := epub.NewEpub(c.Name()) + e.SetAuthor(c.Author()) + + AppendToEpub(e, c, false) + + err := e.Write(filename) + if err != nil { + log.Fatal(err) + } + + fmt.Printf("Ebook saved to \"%s\"\n", filename) +} + +func AppendToEpub(e *epub.Epub, c chapter, imagesOnly bool) { + content := "" + + if imagesOnly == false { + content = c.Content() + } + + // parse content + doc, err := goquery.NewDocumentFromReader(strings.NewReader(c.Content())) + if err != nil { + log.Fatal(err) + } + + // download images and replace src in img tags of content + doc.Find("img").Each(func(i int, s *goquery.Selection) { + src, _ := s.Attr("src") + imagePath, _ := e.AddImage(src, "") + + if imagesOnly { + imageTag, _ := goquery.OuterHtml(s) + content += strings.Replace(imageTag, src, imagePath, 1) + } else { + content = strings.Replace(content, src, imagePath, 1) + } + }) + + html := fmt.Sprintf("

%s

%s", c.Name(), content) + _, err = e.AddSection(html, c.Name(), "", "") + if err != nil { + log.Fatal(err) + } + + for _, sc := range c.SubChapters() { + AppendToEpub(e, sc, false) + } +} diff --git a/book/format_test.go b/book/format_test.go new file mode 100644 index 0000000..0f7accf --- /dev/null +++ b/book/format_test.go @@ -0,0 +1,49 @@ +package book + +import ( + "errors" + "os" + "testing" +) + +func TestToMarkdown(t *testing.T) { + + c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}) + + got := ToMarkdown(c) + want := "Books\n=====\n\n- [Discours de la Méthode](https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/)clock 98 min read -\n1637\n\n- [The Twelve-Factor App](https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/)clock 22 min read -\n2011" + + if got != want { + t.Errorf("got %q, wanted %q", got, want) + } +} + +func TestToEpub(t *testing.T) { + + filename := "ebook.epub" + c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}) + ToEpub(c, filename) + + if _, err := os.Stat(filename); errors.Is(err, os.ErrNotExist) { + t.Errorf("%s does not exist: %v", filename, err) + } else { + if err := os.Remove(filename); err != nil { + t.Errorf("cannot remove %v: %v", filename, err) + } + } +} + +func TestToEpubNoFilename(t *testing.T) { + + filename := "Books.epub" + c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}) + ToEpub(c, "") + + if _, err := os.Stat(filename); errors.Is(err, os.ErrNotExist) { + t.Errorf("%s does not exist: %v", filename, err) + } else { + if err := os.Remove(filename); err != nil { + t.Errorf("cannot remove %v: %v", filename, err) + } + } +} diff --git a/book/scraper.go b/book/scraper.go index 5956d1f..b0bb0dd 100644 --- a/book/scraper.go +++ b/book/scraper.go @@ -1,9 +1,12 @@ package book import ( + "bytes" "fmt" + "io" "log" "math" + "net/http" urllib "net/url" "strings" "sync" @@ -14,14 +17,33 @@ import ( colly "github.com/gocolly/colly/v2" ) -func NewBookFromURL(url, selector, name, author string, recursive, include bool, limit, offset, delay, threads int) book { +type ScrapeConfig struct { + selector string + limit int + include bool + imagesOnly bool +} + +func NewScrapeConfig() *ScrapeConfig { + return &ScrapeConfig{"", -1, true, false} +} + +func NewBookFromURL(url, selector, name, author string, recursive, include, imagesOnly bool, limit, offset, delay, threads int) book { + config1 := NewScrapeConfig() + config1.imagesOnly = imagesOnly + var chapters []chapter var home chapter if recursive { - chapters, home = tableOfContent(url, selector, limit, offset, delay, threads, include) + config2 := NewScrapeConfig() + config2.selector = selector + config2.limit = limit + config2.include = include + config2.imagesOnly = imagesOnly + chapters, home = tableOfContent(url, config1.selector, config1.limit, offset, delay, threads, config1.include) } else { - chapters = []chapter{NewChapterFromURL(url)} + chapters = []chapter{NewChapterFromURL(url, []*ScrapeConfig{config1})} home = chapters[0] } @@ -41,30 +63,82 @@ func NewBookFromURL(url, selector, name, author string, recursive, include bool, return b } -func NewChapterFromURL(url string) chapter { - article, err := readability.FromURL(url, 30*time.Second) +func NewChapterFromURL(url string, configs []*ScrapeConfig) chapter { + config := configs[0] + content := "" + + base, err := urllib.Parse(url) + if err != nil { + log.Fatal(err) + } + + subchapters := []chapter{} + if len(configs) > 1 { + // add subchapters + + links, _, err := GetLinks(base, config.selector, config.limit, 0, false) + if err != nil { + log.Fatal(err) + } + + for _, link := range links { + // and then use it to parse relative URLs + u, err := base.Parse(link.href) + if err != nil { + log.Fatal(err) + } + + subchapters = append(subchapters, NewChapterFromURL(u.String(), configs[1:])) + } + } + + // we want the metadata anyway + + // get page body + response, err := http.Get(url) + if err != nil { + log.Fatal(err) + } + defer response.Body.Close() + + // duplicate response stream + readabilityReader := &bytes.Buffer{} + bodyReader := io.TeeReader(response.Body, readabilityReader) + + // extract HTML body + body, err := io.ReadAll(bodyReader) + + // extract content + article, err := readability.FromReader(readabilityReader, base) if err != nil { log.Fatalf("failed to parse %s, %v\n", url, err) } - content := strings.ReplaceAll(article.Content, "\n", "") + // we don't care about the content if we do not include this level - // if images { - // // parse html content - // doc, err := goquery.NewDocumentFromReader(strings.NewReader(content)) - // if err != nil { - // log.Fatal(err) - // } + if config.include { + content = article.Content - // // extract images only - // content = "" - // doc.Find("img").Each(func(i int, s *goquery.Selection) { - // newContent, _ := goquery.OuterHtml(s) - // content += newContent - // }) - // } + // extract images + if config.imagesOnly { - return chapter{article.Title, article.Byline, content} + // parse HTML + doc, err := goquery.NewDocumentFromReader(strings.NewReader(content)) + if err != nil { + log.Fatal(err) + } + + // append every image to content + content = "" + doc.Find("img").Each(func(i int, s *goquery.Selection) { + imageTag, _ := goquery.OuterHtml(s) + content += imageTag + }) + + } + } + + return chapter{string(body), article.Title, article.Byline, content, subchapters, config} } func tableOfContent(url, selector string, limit, offset, delay, threads int, include bool) ([]chapter, chapter) { @@ -91,7 +165,7 @@ func tableOfContent(url, selector string, limit, offset, delay, threads int, inc log.Fatal(err) } - chapters[index] = NewChapterFromURL(u.String()) + chapters[index] = NewChapterFromURL(u.String(), []*ScrapeConfig{NewScrapeConfig()}) progress.Incr(index) // short sleep for last chapter to let the progress bar update @@ -125,7 +199,7 @@ func tableOfContent(url, selector string, limit, offset, delay, threads int, inc log.Fatal(err) } - chapters[index] = NewChapterFromURL(u.String()) + chapters[index] = NewChapterFromURL(u.String(), []*ScrapeConfig{NewScrapeConfig()}) progress.Incr(index) <-semaphore @@ -202,7 +276,7 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool) links = links[offset:end] - home := NewChapterFromURL(url.String()) + home := NewChapterFromURL(url.String(), []*ScrapeConfig{NewScrapeConfig()}) if include { l := NewLink(url.String(), home.Name()) diff --git a/book/scraper_test.go b/book/scraper_test.go new file mode 100644 index 0000000..22635c0 --- /dev/null +++ b/book/scraper_test.go @@ -0,0 +1,131 @@ +package book + +import "testing" + +func TestBody(t *testing.T) { + + c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}) + + got := c.Body() + want := "\n\n \n Books\n \n \n \n \n \n \n \n\n \n \n\n\n\n \n\n\n\n\n\n\n \n \n
\n \"John\n

Books

\n

\n
\n \n
\n
\n
\n \n
\n

Books

\n \n \n\n\n\n
\n\n
\n \n\n" + + if got != want { + t.Errorf("got %v, wanted %v", got, want) + } +} + +func TestName(t *testing.T) { + + c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}) + + got := c.Name() + want := "Books" + + if got != want { + t.Errorf("got %v, wanted %v", got, want) + } +} + +func TestAuthor(t *testing.T) { + + c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}) + + got := c.Author() + want := "John Doe" + + if got != want { + t.Errorf("got %v, wanted %v", got, want) + } +} + +func TestContent(t *testing.T) { + + c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}) + + got := c.Content() + want := "
\n \n
\n \n \n\n
\n \n\n
" + + if got != want { + t.Errorf("got %v, wanted %v", got, want) + } +} + +func TestContentImagesOnly(t *testing.T) { + + config := NewScrapeConfig() + config.imagesOnly = true + c := NewChapterFromURL("https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/", []*ScrapeConfig{config}) + + got := c.Content() + want := "\"One\"A\"Code\"Scale" + + if got != want { + t.Errorf("got %v, wanted %v", got, want) + } +} + +func TestSubChapters(t *testing.T) { + + c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig(), NewScrapeConfig()}) + + got := len(c.SubChapters()) + want := 2 + + if got != want { + t.Errorf("got %v, wanted %v", got, want) + } +} + +func TestSubChaptersSelector(t *testing.T) { + + c := NewChapterFromURL("https://12factor.net/", []*ScrapeConfig{{"section.concrete > article > h2 > a", -1, true, false}, NewScrapeConfig()}) + + got := len(c.SubChapters()) + want := 12 + + if got != want { + t.Errorf("got %v, wanted %v", got, want) + } +} + +func TestSubChaptersLimit(t *testing.T) { + + config := NewScrapeConfig() + config.limit = 1 + c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config, NewScrapeConfig()}) + + got := len(c.SubChapters()) + want := 1 + + if got != want { + t.Errorf("got %v, wanted %v", got, want) + } +} + +func TestSubChaptersLimitOver(t *testing.T) { + + config := NewScrapeConfig() + config.limit = 3 + c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config, NewScrapeConfig()}) + + got := len(c.SubChapters()) + want := 2 + + if got != want { + t.Errorf("got %v, wanted %v", got, want) + } +} + +func TestNotInclude(t *testing.T) { + + config := NewScrapeConfig() + config.include = false + c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config}) + + got := c.Content() + want := "" + + if got != want { + t.Errorf("got %v, wanted %v", got, want) + } +} diff --git a/cmd/get.go b/cmd/get.go index 66cc70c..51630d7 100644 --- a/cmd/get.go +++ b/cmd/get.go @@ -77,7 +77,7 @@ var getCmd = &cobra.Command{ }, Run: func(cmd *cobra.Command, args []string) { url := args[0] - b := book.NewBookFromURL(url, selector, name, author, recursive, include, limit, offset, delay, threads) + b := book.NewBookFromURL(url, selector, name, author, recursive, include, images, limit, offset, delay, threads) if len(output) == 0 { // set default output diff --git a/release.sh b/release.sh index 73d4514..cdd6d2a 100755 --- a/release.sh +++ b/release.sh @@ -3,12 +3,18 @@ version=$1 platforms=("linux/amd64" "darwin/amd64" "windows/amd64") +if [ "$#" -ne 1 ]; then + echo "Illegal number of parameters" + echo "Usage: ./release.sh X.X.X" + exit 1 +fi + for platform in "${platforms[@]}" do platform_split=(${platform//\// }) GOOS=${platform_split[0]} GOARCH=${platform_split[1]} - output_name='papeer-'$version'-'$GOOS'-'$GOARCH + output_name='papeer-v'$version'-'$GOOS'-'$GOARCH if [ $GOOS = "windows" ]; then output_name+='.exe' fi