mirror of
https://github.com/NohamR/papeer.git
synced 2026-05-25 04:17:19 +00:00
add test suites, scrape config
This commit is contained in:
1
Makefile
1
Makefile
@@ -8,3 +8,4 @@ clean:
|
||||
find . -maxdepth 1 -not -name 'README.md' -name '*.md' -delete
|
||||
find . -maxdepth 1 -name '*.epub' -delete
|
||||
find . -maxdepth 1 -name '*.mobi' -delete
|
||||
find . -maxdepth 1 -name 'papeer-v*' -delete
|
||||
|
||||
44
README.md
44
README.md
@@ -59,13 +59,20 @@ The `recursive` option lets you extract the table of content of a website, then
|
||||
Before trying the `recursive` option, it is a good idea to use the `ls` command, which lets you visualize the content that will be retrieved. You can use several options to customize the table of contents extraction, such as `selector`, `limit`, `offset` and `include`. Type `papeer help` for more information about those options.
|
||||
|
||||
```sh
|
||||
papeer ls https://news.ycombinator.com/ --limit=5
|
||||
# # NAME URL
|
||||
# 1 Tailwind CSS v3.0 https://tailwindcss.com/blog/tailwindcss-v3
|
||||
# 2 A molten salt storage solution using sodium hydroxide https://sifted.eu/articles/salt-energy-storage-seaborg-hyme/
|
||||
# 3 HashiCorp IPO today https://www.hashicorp.com/blog/a-new-chapter-for-hashicorp
|
||||
# 4 Stack Graphs https://github.blog/2021-12-09-introducing-stack-graphs/
|
||||
# 5 ‘Tipping point’ makes partisan polarization irreversible https://news.cornell.edu/stories/2021/12/tipping-point-makes-partisan-polarization-irreversible
|
||||
papeer ls https://12factor.net/ -s 'section.concrete > article > h2 > a'
|
||||
# # NAME URL
|
||||
# 1 I. Codebase https://12factor.net/codebase
|
||||
# 2 II. Dependencies https://12factor.net/dependencies
|
||||
# 3 III. Config https://12factor.net/config
|
||||
# 4 IV. Backing services https://12factor.net/backing-services
|
||||
# 5 V. Build, release, run https://12factor.net/build-release-run
|
||||
# 6 VI. Processes https://12factor.net/processes
|
||||
# 7 VII. Port binding https://12factor.net/port-binding
|
||||
# 8 VIII. Concurrency https://12factor.net/concurrency
|
||||
# 9 IX. Disposability https://12factor.net/disposability
|
||||
# 10 X. Dev/prod parity https://12factor.net/dev-prod-parity
|
||||
# 11 XI. Logs https://12factor.net/logs
|
||||
# 12 XII. Admin processes https://12factor.net/admin-processes
|
||||
```
|
||||
|
||||
### Scrape time
|
||||
@@ -73,14 +80,21 @@ papeer ls https://news.ycombinator.com/ --limit=5
|
||||
Once you are satisfied with the table of contents listed by the `ls` command, you can actually scrape the content of those pages. You can use the same options that you specified for the `ls` command. In recursive mode, you can also use the `delay` and `threads` options.
|
||||
|
||||
```sh
|
||||
papeer get https://news.ycombinator.com/ --recursive --delay=500 --limit=5 --format=md
|
||||
# [========================================>---------------------------] Chapters 3 / 5
|
||||
# [====================================================================] 1. Tailwind CSS v3.0
|
||||
# [====================================================================] 2. A molten salt storage solution using sodium hydroxide
|
||||
# [====================================================================] 3. HashiCorp IPO today
|
||||
# [--------------------------------------------------------------------] 4. Stack Graphs
|
||||
# [--------------------------------------------------------------------] 5. ‘Tipping point’ makes partisan polarization irreversible
|
||||
# Markdown saved to "Hacker News.md"
|
||||
papeer get https://12factor.net/ --recursive -s 'section.concrete > article > h2 > a' --format=md
|
||||
# [======================================>-----------------------------] Chapters 7 / 12
|
||||
# [====================================================================] 1. I. Codebase
|
||||
# [====================================================================] 2. II. Dependencies
|
||||
# [====================================================================] 3. III. Config
|
||||
# [====================================================================] 4. IV. Backing services
|
||||
# [====================================================================] 5. V. Build, release, run
|
||||
# [====================================================================] 6. VI. Processes
|
||||
# [====================================================================] 7. VII. Port binding
|
||||
# [--------------------------------------------------------------------] 8. VIII. Concurrency
|
||||
# [--------------------------------------------------------------------] 9. IX. Disposability
|
||||
# [--------------------------------------------------------------------] 10. X. Dev/prod parity
|
||||
# [--------------------------------------------------------------------] 11. XI. Logs
|
||||
# [--------------------------------------------------------------------] 12. XII. Admin processes
|
||||
# Markdown saved to "The_Twelve-Factor_App.md"
|
||||
```
|
||||
|
||||
# Installation
|
||||
|
||||
@@ -1,13 +1,20 @@
|
||||
package book
|
||||
|
||||
type chapter struct {
|
||||
name string
|
||||
author string
|
||||
content string
|
||||
body string
|
||||
name string
|
||||
author string
|
||||
content string
|
||||
subChapters []chapter
|
||||
config *ScrapeConfig
|
||||
}
|
||||
|
||||
func NewChapter(name, author, content string) chapter {
|
||||
return chapter{name, author, content}
|
||||
func NewChapter(body, name, author, content string, subChapters []chapter, config *ScrapeConfig) chapter {
|
||||
return chapter{body, name, author, content, subChapters, config}
|
||||
}
|
||||
|
||||
// Body returns the chapter's body field. NewChapterFromURL fills it with the
// raw HTTP response body of the scraped page.
func (c chapter) Body() string {
|
||||
return c.body
|
||||
}
|
||||
|
||||
func (c chapter) Name() string {
|
||||
@@ -21,3 +28,7 @@ func (c chapter) Author() string {
|
||||
// Content returns the chapter's content field, i.e. the HTML that the
// formatters (ToMarkdown, AppendToEpub) convert into the final document.
func (c chapter) Content() string {
|
||||
return c.content
|
||||
}
|
||||
|
||||
// SubChapters returns the chapters nested under this one. The stored slice is
// returned as is, without copying.
func (c chapter) SubChapters() []chapter {
|
||||
return c.subChapters
|
||||
}
|
||||
|
||||
90
book/format.go
Normal file
90
book/format.go
Normal file
@@ -0,0 +1,90 @@
|
||||
package book
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"strings"
|
||||
|
||||
md "github.com/JohannesKaufmann/html-to-markdown"
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
epub "github.com/bmaupin/go-epub"
|
||||
)
|
||||
|
||||
func ToMarkdown(c chapter) string {
|
||||
|
||||
// make title
|
||||
underline := strings.Repeat("=", len(c.Name()))
|
||||
title := fmt.Sprintf("%s\n%s", c.Name(), underline)
|
||||
|
||||
// convert content to markdown
|
||||
content, err := md.NewConverter("", true, nil).ConvertString(c.Content())
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
// merge title and content
|
||||
content = fmt.Sprintf("%s\n\n%s", title, content)
|
||||
|
||||
for _, sc := range c.SubChapters() {
|
||||
// merge subchapters
|
||||
content = fmt.Sprintf("%s\n\n\n%s", content, ToMarkdown(sc))
|
||||
}
|
||||
|
||||
return content
|
||||
}
|
||||
|
||||
func ToEpub(c chapter, filename string) {
|
||||
if len(filename) == 0 {
|
||||
filename = fmt.Sprintf("%s.epub", c.Name())
|
||||
}
|
||||
|
||||
// init ebook
|
||||
e := epub.NewEpub(c.Name())
|
||||
e.SetAuthor(c.Author())
|
||||
|
||||
AppendToEpub(e, c, false)
|
||||
|
||||
err := e.Write(filename)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
fmt.Printf("Ebook saved to \"%s\"\n", filename)
|
||||
}
|
||||
|
||||
func AppendToEpub(e *epub.Epub, c chapter, imagesOnly bool) {
|
||||
content := ""
|
||||
|
||||
if imagesOnly == false {
|
||||
content = c.Content()
|
||||
}
|
||||
|
||||
// parse content
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(c.Content()))
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
// download images and replace src in img tags of content
|
||||
doc.Find("img").Each(func(i int, s *goquery.Selection) {
|
||||
src, _ := s.Attr("src")
|
||||
imagePath, _ := e.AddImage(src, "")
|
||||
|
||||
if imagesOnly {
|
||||
imageTag, _ := goquery.OuterHtml(s)
|
||||
content += strings.Replace(imageTag, src, imagePath, 1)
|
||||
} else {
|
||||
content = strings.Replace(content, src, imagePath, 1)
|
||||
}
|
||||
})
|
||||
|
||||
html := fmt.Sprintf("<h1>%s</h1>%s", c.Name(), content)
|
||||
_, err = e.AddSection(html, c.Name(), "", "")
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
for _, sc := range c.SubChapters() {
|
||||
AppendToEpub(e, sc, false)
|
||||
}
|
||||
}
|
||||
49
book/format_test.go
Normal file
49
book/format_test.go
Normal file
@@ -0,0 +1,49 @@
|
||||
package book
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"os"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestToMarkdown(t *testing.T) {
|
||||
|
||||
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()})
|
||||
|
||||
got := ToMarkdown(c)
|
||||
want := "Books\n=====\n\n- [Discours de la Méthode](https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/)clock 98 min read -\n1637\n\n- [The Twelve-Factor App](https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/)clock 22 min read -\n2011"
|
||||
|
||||
if got != want {
|
||||
t.Errorf("got %q, wanted %q", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestToEpub(t *testing.T) {
|
||||
|
||||
filename := "ebook.epub"
|
||||
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()})
|
||||
ToEpub(c, filename)
|
||||
|
||||
if _, err := os.Stat(filename); errors.Is(err, os.ErrNotExist) {
|
||||
t.Errorf("%s does not exist: %v", filename, err)
|
||||
} else {
|
||||
if err := os.Remove(filename); err != nil {
|
||||
t.Errorf("cannot remove %v: %v", filename, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestToEpubNoFilename(t *testing.T) {
|
||||
|
||||
filename := "Books.epub"
|
||||
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()})
|
||||
ToEpub(c, "")
|
||||
|
||||
if _, err := os.Stat(filename); errors.Is(err, os.ErrNotExist) {
|
||||
t.Errorf("%s does not exist: %v", filename, err)
|
||||
} else {
|
||||
if err := os.Remove(filename); err != nil {
|
||||
t.Errorf("cannot remove %v: %v", filename, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
120
book/scraper.go
120
book/scraper.go
@@ -1,9 +1,12 @@
|
||||
package book
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"math"
|
||||
"net/http"
|
||||
urllib "net/url"
|
||||
"strings"
|
||||
"sync"
|
||||
@@ -14,14 +17,33 @@ import (
|
||||
colly "github.com/gocolly/colly/v2"
|
||||
)
|
||||
|
||||
func NewBookFromURL(url, selector, name, author string, recursive, include bool, limit, offset, delay, threads int) book {
|
||||
// ScrapeConfig holds the settings applied to one level of a scrape.
// NewChapterFromURL receives one ScrapeConfig per depth level.
// The field order matters: the struct is also built with unkeyed literals.
type ScrapeConfig struct {
|
||||
// selector is handed to GetLinks when looking up the sub-chapter links.
selector string
|
||||
// limit is handed to GetLinks as the link limit; the default is -1.
limit int
|
||||
// include tells whether the page's own content is kept; when false the
// chapter content stays empty.
include bool
|
||||
// imagesOnly reduces the kept content to its <img> tags.
imagesOnly bool
|
||||
}
|
||||
|
||||
func NewScrapeConfig() *ScrapeConfig {
|
||||
return &ScrapeConfig{"", -1, true, false}
|
||||
}
|
||||
|
||||
func NewBookFromURL(url, selector, name, author string, recursive, include, imagesOnly bool, limit, offset, delay, threads int) book {
|
||||
config1 := NewScrapeConfig()
|
||||
config1.imagesOnly = imagesOnly
|
||||
|
||||
var chapters []chapter
|
||||
var home chapter
|
||||
|
||||
if recursive {
|
||||
chapters, home = tableOfContent(url, selector, limit, offset, delay, threads, include)
|
||||
config2 := NewScrapeConfig()
|
||||
config2.selector = selector
|
||||
config2.limit = limit
|
||||
config2.include = include
|
||||
config2.imagesOnly = imagesOnly
|
||||
chapters, home = tableOfContent(url, config1.selector, config1.limit, offset, delay, threads, config1.include)
|
||||
} else {
|
||||
chapters = []chapter{NewChapterFromURL(url)}
|
||||
chapters = []chapter{NewChapterFromURL(url, []*ScrapeConfig{config1})}
|
||||
home = chapters[0]
|
||||
}
|
||||
|
||||
@@ -41,30 +63,82 @@ func NewBookFromURL(url, selector, name, author string, recursive, include bool,
|
||||
return b
|
||||
}
|
||||
|
||||
func NewChapterFromURL(url string) chapter {
|
||||
article, err := readability.FromURL(url, 30*time.Second)
|
||||
func NewChapterFromURL(url string, configs []*ScrapeConfig) chapter {
|
||||
config := configs[0]
|
||||
content := ""
|
||||
|
||||
base, err := urllib.Parse(url)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
subchapters := []chapter{}
|
||||
if len(configs) > 1 {
|
||||
// add subchapters
|
||||
|
||||
links, _, err := GetLinks(base, config.selector, config.limit, 0, false)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
for _, link := range links {
|
||||
// and then use it to parse relative URLs
|
||||
u, err := base.Parse(link.href)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
subchapters = append(subchapters, NewChapterFromURL(u.String(), configs[1:]))
|
||||
}
|
||||
}
|
||||
|
||||
// we want the metadata anyway
|
||||
|
||||
// get page body
|
||||
response, err := http.Get(url)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
defer response.Body.Close()
|
||||
|
||||
// duplicate response stream
|
||||
readabilityReader := &bytes.Buffer{}
|
||||
bodyReader := io.TeeReader(response.Body, readabilityReader)
|
||||
|
||||
// extract HTML body
|
||||
body, err := io.ReadAll(bodyReader)
|
||||
|
||||
// extract content
|
||||
article, err := readability.FromReader(readabilityReader, base)
|
||||
if err != nil {
|
||||
log.Fatalf("failed to parse %s, %v\n", url, err)
|
||||
}
|
||||
|
||||
content := strings.ReplaceAll(article.Content, "\n", "")
|
||||
// we don't care about the content if we do not include this level
|
||||
|
||||
// if images {
|
||||
// // parse html content
|
||||
// doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
|
||||
// if err != nil {
|
||||
// log.Fatal(err)
|
||||
// }
|
||||
if config.include {
|
||||
content = article.Content
|
||||
|
||||
// // extract images only
|
||||
// content = ""
|
||||
// doc.Find("img").Each(func(i int, s *goquery.Selection) {
|
||||
// newContent, _ := goquery.OuterHtml(s)
|
||||
// content += newContent
|
||||
// })
|
||||
// }
|
||||
// extract images
|
||||
if config.imagesOnly {
|
||||
|
||||
return chapter{article.Title, article.Byline, content}
|
||||
// parse HTML
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
// append every image to content
|
||||
content = ""
|
||||
doc.Find("img").Each(func(i int, s *goquery.Selection) {
|
||||
imageTag, _ := goquery.OuterHtml(s)
|
||||
content += imageTag
|
||||
})
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return chapter{string(body), article.Title, article.Byline, content, subchapters, config}
|
||||
}
|
||||
|
||||
func tableOfContent(url, selector string, limit, offset, delay, threads int, include bool) ([]chapter, chapter) {
|
||||
@@ -91,7 +165,7 @@ func tableOfContent(url, selector string, limit, offset, delay, threads int, inc
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
chapters[index] = NewChapterFromURL(u.String())
|
||||
chapters[index] = NewChapterFromURL(u.String(), []*ScrapeConfig{NewScrapeConfig()})
|
||||
progress.Incr(index)
|
||||
|
||||
// short sleep for last chapter to let the progress bar update
|
||||
@@ -125,7 +199,7 @@ func tableOfContent(url, selector string, limit, offset, delay, threads int, inc
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
chapters[index] = NewChapterFromURL(u.String())
|
||||
chapters[index] = NewChapterFromURL(u.String(), []*ScrapeConfig{NewScrapeConfig()})
|
||||
progress.Incr(index)
|
||||
|
||||
<-semaphore
|
||||
@@ -202,7 +276,7 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool)
|
||||
|
||||
links = links[offset:end]
|
||||
|
||||
home := NewChapterFromURL(url.String())
|
||||
home := NewChapterFromURL(url.String(), []*ScrapeConfig{NewScrapeConfig()})
|
||||
|
||||
if include {
|
||||
l := NewLink(url.String(), home.Name())
|
||||
|
||||
131
book/scraper_test.go
Normal file
131
book/scraper_test.go
Normal file
@@ -0,0 +1,131 @@
|
||||
package book
|
||||
|
||||
import "testing"
|
||||
|
||||
// TestBody scrapes the fixture site's home page and compares the chapter's
// Body() with the expected HTML document.
func TestBody(t *testing.T) {
|
||||
|
||||
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()})
|
||||
|
||||
got := c.Body()
|
||||
// expected HTML of the fixture home page
want := "<!doctype html>\n<html lang=\"en-us\">\n <head>\n <title>Books</title>\n <link rel=\"shortcut icon\" href=\"/favicon.ico\" />\n <meta charset=\"utf-8\" />\n <meta name=\"generator\" content=\"Hugo 0.59.1\" />\n <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />\n <meta name=\"author\" content=\"John Doe\" />\n <meta name=\"description\" content=\" \" />\n <link rel=\"stylesheet\" href=\"https://books.lapw.at/css/main.min.88e7083eff65effb7485b6e6f38d10afbec25093a6fac42d734ce9024d3defbd.css\" />\n\n \n <meta name=\"twitter:card\" content=\"summary\"/>\n<meta name=\"twitter:title\" content=\"Books\"/>\n<meta name=\"twitter:description\" content=\" \"/>\n\n <meta property=\"og:title\" content=\"Books\" />\n<meta property=\"og:description\" content=\" \" />\n<meta property=\"og:type\" content=\"website\" />\n<meta property=\"og:url\" content=\"https://books.lapw.at/\" />\n\n\n\n </head>\n <body>\n <header class=\"app-header\">\n <a href=\"https://books.lapw.at/\"><img class=\"app-header-avatar\" src=\"/book.svg\" alt=\"John Doe\" /></a>\n <h1>Books</h1>\n <p> </p>\n <div class=\"app-header-social\">\n \n </div>\n </header>\n <main class=\"app-container\">\n \n <article>\n <h1>Books</h1>\n <ul class=\"posts-list\">\n \n <li class=\"posts-list-item\">\n <a class=\"posts-list-item-title\" href=\"https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/\">Discours de la Méthode</a>\n <span class=\"posts-list-item-description\">\n <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\" class=\"icon icon-clock\">\n <title>clock</title>\n <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 98 min read -\n 1637\n </span>\n </li>\n \n <li class=\"posts-list-item\">\n <a class=\"posts-list-item-title\" 
href=\"https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/\">The Twelve-Factor App</a>\n <span class=\"posts-list-item-description\">\n <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\" class=\"icon icon-clock\">\n <title>clock</title>\n <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 22 min read -\n 2011\n </span>\n </li>\n \n </ul>\n \n\n\n\n </article>\n\n </main>\n </body>\n</html>\n"
|
||||
|
||||
if got != want {
|
||||
t.Errorf("got %v, wanted %v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestName(t *testing.T) {
|
||||
|
||||
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()})
|
||||
|
||||
got := c.Name()
|
||||
want := "Books"
|
||||
|
||||
if got != want {
|
||||
t.Errorf("got %v, wanted %v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAuthor(t *testing.T) {
|
||||
|
||||
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()})
|
||||
|
||||
got := c.Author()
|
||||
want := "John Doe"
|
||||
|
||||
if got != want {
|
||||
t.Errorf("got %v, wanted %v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
// TestContent scrapes the fixture site's home page with the default
// configuration and compares the chapter's Content() with the expected
// extracted HTML.
func TestContent(t *testing.T) {
|
||||
|
||||
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()})
|
||||
|
||||
got := c.Content()
|
||||
// expected extract of the fixture home page
want := "<div id=\"readability-page-1\" class=\"page\">\n \n <main>\n \n <article>\n \n <ul>\n \n <li>\n <a href=\"https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/\">Discours de la Méthode</a>\n <span>\n <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\">\n <title>clock</title>\n <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 98 min read -\n 1637\n </span>\n </li>\n \n <li>\n <a href=\"https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/\">The Twelve-Factor App</a>\n <span>\n <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\">\n <title>clock</title>\n <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 22 min read -\n 2011\n </span>\n </li>\n \n </ul>\n \n\n\n\n </article>\n\n </main>\n \n\n</div>"
|
||||
|
||||
if got != want {
|
||||
t.Errorf("got %v, wanted %v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestContentImagesOnly(t *testing.T) {
|
||||
|
||||
config := NewScrapeConfig()
|
||||
config.imagesOnly = true
|
||||
c := NewChapterFromURL("https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/", []*ScrapeConfig{config})
|
||||
|
||||
got := c.Content()
|
||||
want := "<img src=\"https://books.lapw.at/images/codebase-deploys.png\" alt=\"One codebase maps to many deploys\"/><img src=\"https://books.lapw.at/images/attached-resources.png\" alt=\"A production deploy attached to four backing services.\"/><img src=\"https://books.lapw.at/images/release.png\" alt=\"Code becomes a build, which is combined with config to create a release.\"/><img src=\"https://books.lapw.at/images/process-types.png\" alt=\"Scale is expressed as running processes, workload diversity is expressed as process types.\"/>"
|
||||
|
||||
if got != want {
|
||||
t.Errorf("got %v, wanted %v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSubChapters(t *testing.T) {
|
||||
|
||||
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig(), NewScrapeConfig()})
|
||||
|
||||
got := len(c.SubChapters())
|
||||
want := 2
|
||||
|
||||
if got != want {
|
||||
t.Errorf("got %v, wanted %v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSubChaptersSelector(t *testing.T) {
|
||||
|
||||
c := NewChapterFromURL("https://12factor.net/", []*ScrapeConfig{{"section.concrete > article > h2 > a", -1, true, false}, NewScrapeConfig()})
|
||||
|
||||
got := len(c.SubChapters())
|
||||
want := 12
|
||||
|
||||
if got != want {
|
||||
t.Errorf("got %v, wanted %v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSubChaptersLimit(t *testing.T) {
|
||||
|
||||
config := NewScrapeConfig()
|
||||
config.limit = 1
|
||||
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config, NewScrapeConfig()})
|
||||
|
||||
got := len(c.SubChapters())
|
||||
want := 1
|
||||
|
||||
if got != want {
|
||||
t.Errorf("got %v, wanted %v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSubChaptersLimitOver(t *testing.T) {
|
||||
|
||||
config := NewScrapeConfig()
|
||||
config.limit = 3
|
||||
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config, NewScrapeConfig()})
|
||||
|
||||
got := len(c.SubChapters())
|
||||
want := 2
|
||||
|
||||
if got != want {
|
||||
t.Errorf("got %v, wanted %v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNotInclude(t *testing.T) {
|
||||
|
||||
config := NewScrapeConfig()
|
||||
config.include = false
|
||||
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config})
|
||||
|
||||
got := c.Content()
|
||||
want := ""
|
||||
|
||||
if got != want {
|
||||
t.Errorf("got %v, wanted %v", got, want)
|
||||
}
|
||||
}
|
||||
@@ -77,7 +77,7 @@ var getCmd = &cobra.Command{
|
||||
},
|
||||
Run: func(cmd *cobra.Command, args []string) {
|
||||
url := args[0]
|
||||
b := book.NewBookFromURL(url, selector, name, author, recursive, include, limit, offset, delay, threads)
|
||||
b := book.NewBookFromURL(url, selector, name, author, recursive, include, images, limit, offset, delay, threads)
|
||||
|
||||
if len(output) == 0 {
|
||||
// set default output
|
||||
|
||||
@@ -3,12 +3,18 @@
|
||||
version=$1
|
||||
platforms=("linux/amd64" "darwin/amd64" "windows/amd64")
|
||||
|
||||
if [ "$#" -ne 1 ]; then
|
||||
echo "Illegal number of parameters"
|
||||
echo "Usage: ./release.sh X.X.X"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
for platform in "${platforms[@]}"
|
||||
do
|
||||
platform_split=(${platform//\// })
|
||||
GOOS=${platform_split[0]}
|
||||
GOARCH=${platform_split[1]}
|
||||
output_name='papeer-'$version'-'$GOOS'-'$GOARCH
|
||||
output_name='papeer-v'$version'-'$GOOS'-'$GOARCH
|
||||
if [ $GOOS = "windows" ]; then
|
||||
output_name+='.exe'
|
||||
fi
|
||||
|
||||
Reference in New Issue
Block a user