mirror of
https://github.com/NohamR/papeer.git
synced 2026-05-25 20:00:47 +00:00
chain selectors, depth & quiet options, split main commands
This commit is contained in:
@@ -21,10 +21,10 @@ func Filename(name string) string {
|
||||
return filename
|
||||
}
|
||||
|
||||
func ToMarkdown(c chapter) string {
|
||||
func ToMarkdownString(c chapter) string {
|
||||
markdown := ""
|
||||
|
||||
if c.config.include {
|
||||
if c.config.Include {
|
||||
// title
|
||||
markdown += fmt.Sprintf("%s\n", c.Name())
|
||||
markdown += fmt.Sprintf("%s\n\n", strings.Repeat("=", len(c.Name())))
|
||||
@@ -39,12 +39,33 @@ func ToMarkdown(c chapter) string {
|
||||
|
||||
for _, sc := range c.SubChapters() {
|
||||
// subchapters content
|
||||
markdown += fmt.Sprintf("%s\n\n\n", ToMarkdown(sc))
|
||||
markdown += fmt.Sprintf("%s\n\n\n", ToMarkdownString(sc))
|
||||
}
|
||||
|
||||
return markdown
|
||||
}
|
||||
|
||||
func ToMarkdown(c chapter, filename string) string {
|
||||
if len(filename) == 0 {
|
||||
filename = fmt.Sprintf("%s.md", Filename(c.Name()))
|
||||
}
|
||||
|
||||
markdown := ToMarkdownString(c)
|
||||
|
||||
// write to file
|
||||
f, err := os.Create(filename)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
_, err2 := f.WriteString(markdown)
|
||||
if err2 != nil {
|
||||
log.Fatal(err2)
|
||||
}
|
||||
f.Close()
|
||||
|
||||
return filename
|
||||
}
|
||||
|
||||
func ToEpub(c chapter, filename string) string {
|
||||
if len(filename) == 0 {
|
||||
filename = fmt.Sprintf("%s.epub", Filename(c.Name()))
|
||||
@@ -67,9 +88,9 @@ func ToEpub(c chapter, filename string) string {
|
||||
func AppendToEpub(e *epub.Epub, c chapter) {
|
||||
content := ""
|
||||
|
||||
if c.config.include {
|
||||
if c.config.Include {
|
||||
|
||||
if c.config.imagesOnly == false {
|
||||
if c.config.ImagesOnly == false {
|
||||
content = c.Content()
|
||||
}
|
||||
|
||||
@@ -85,7 +106,7 @@ func AppendToEpub(e *epub.Epub, c chapter) {
|
||||
src = strings.Split(src, "?")[0] // remove query part
|
||||
imagePath, _ := e.AddImage(src, "")
|
||||
|
||||
if c.config.imagesOnly {
|
||||
if c.config.ImagesOnly {
|
||||
imageTag, _ := goquery.OuterHtml(s)
|
||||
content += strings.Replace(imageTag, src, imagePath, 1)
|
||||
} else {
|
||||
@@ -94,8 +115,8 @@ func AppendToEpub(e *epub.Epub, c chapter) {
|
||||
})
|
||||
|
||||
html := ""
|
||||
// add title only if imagesOnly = false
|
||||
if c.config.imagesOnly == false {
|
||||
// add title only if ImagesOnly = false
|
||||
if c.config.ImagesOnly == false {
|
||||
html += fmt.Sprintf("<h1>%s</h1>", c.Name())
|
||||
}
|
||||
html += content
|
||||
|
||||
@@ -17,11 +17,11 @@ func TestFilename(t *testing.T) {
|
||||
|
||||
}
|
||||
|
||||
func TestToMarkdown(t *testing.T) {
|
||||
func TestToMarkdownString(t *testing.T) {
|
||||
|
||||
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
|
||||
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
|
||||
|
||||
got := ToMarkdown(c)
|
||||
got := ToMarkdownString(c)
|
||||
want := "Books\n=====\n\n- [Discours de la Méthode](https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/)clock 98 min read -\n1637\n\n- [The Twelve-Factor App](https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/)clock 22 min read -\n2011\n\n\n"
|
||||
|
||||
if got != want {
|
||||
@@ -30,12 +30,44 @@ func TestToMarkdown(t *testing.T) {
|
||||
|
||||
}
|
||||
|
||||
func TestToMarkdown(t *testing.T) {
|
||||
|
||||
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
|
||||
ToMarkdown(c, "")
|
||||
|
||||
filename := "Books.md"
|
||||
if _, err := os.Stat(filename); errors.Is(err, os.ErrNotExist) {
|
||||
t.Errorf("%s does not exist: %v", filename, err)
|
||||
} else {
|
||||
if err := os.Remove(filename); err != nil {
|
||||
t.Errorf("cannot remove %v: %v", filename, err)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func TestToMarkdownFilename(t *testing.T) {
|
||||
|
||||
filename := "ebook.md"
|
||||
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
|
||||
ToMarkdown(c, filename)
|
||||
|
||||
if _, err := os.Stat(filename); errors.Is(err, os.ErrNotExist) {
|
||||
t.Errorf("%s does not exist: %v", filename, err)
|
||||
} else {
|
||||
if err := os.Remove(filename); err != nil {
|
||||
t.Errorf("cannot remove %v: %v", filename, err)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func TestToEpub(t *testing.T) {
|
||||
|
||||
filename := "Books.epub"
|
||||
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
|
||||
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
|
||||
ToEpub(c, "")
|
||||
|
||||
filename := "Books.epub"
|
||||
if _, err := os.Stat(filename); errors.Is(err, os.ErrNotExist) {
|
||||
t.Errorf("%s does not exist: %v", filename, err)
|
||||
} else {
|
||||
@@ -49,7 +81,7 @@ func TestToEpub(t *testing.T) {
|
||||
func TestToEpubFilename(t *testing.T) {
|
||||
|
||||
filename := "ebook.epub"
|
||||
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
|
||||
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
|
||||
ToEpub(c, filename)
|
||||
|
||||
if _, err := os.Stat(filename); errors.Is(err, os.ErrNotExist) {
|
||||
@@ -65,7 +97,23 @@ func TestToEpubFilename(t *testing.T) {
|
||||
func TestToMobi(t *testing.T) {
|
||||
|
||||
filename := "ebook.mobi"
|
||||
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
|
||||
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
|
||||
ToMobi(c, filename)
|
||||
|
||||
if _, err := os.Stat(filename); errors.Is(err, os.ErrNotExist) {
|
||||
t.Errorf("%s does not exist: %v", filename, err)
|
||||
} else {
|
||||
if err := os.Remove(filename); err != nil {
|
||||
t.Errorf("cannot remove %v: %v", filename, err)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func TestToMobiFilename(t *testing.T) {
|
||||
|
||||
filename := "ebook.mobi"
|
||||
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
|
||||
ToMobi(c, filename)
|
||||
|
||||
if _, err := os.Stat(filename); errors.Is(err, os.ErrNotExist) {
|
||||
|
||||
205
book/scraper.go
205
book/scraper.go
@@ -18,83 +18,100 @@ import (
|
||||
)
|
||||
|
||||
type ScrapeConfig struct {
|
||||
depth int
|
||||
selector string
|
||||
limit int
|
||||
offset int
|
||||
delay int
|
||||
threads int
|
||||
include bool
|
||||
imagesOnly bool
|
||||
Depth int
|
||||
Selector string
|
||||
Quiet bool
|
||||
Limit int
|
||||
Offset int
|
||||
Delay int
|
||||
Threads int
|
||||
Include bool
|
||||
ImagesOnly bool
|
||||
UseLinkName bool
|
||||
}
|
||||
|
||||
func NewScrapeConfig() *ScrapeConfig {
|
||||
return &ScrapeConfig{0, "", -1, 0, -1, -1, true, false}
|
||||
return &ScrapeConfig{0, "", false, -1, 0, -1, -1, true, false, false}
|
||||
}
|
||||
|
||||
func NewScrapeConfigs(selectors []string) []*ScrapeConfig {
|
||||
configs := []*ScrapeConfig{}
|
||||
|
||||
for _, s := range selectors {
|
||||
config := NewScrapeConfig()
|
||||
config.Selector = s
|
||||
|
||||
configs = append(configs, config)
|
||||
}
|
||||
|
||||
return configs
|
||||
}
|
||||
|
||||
func NewScrapeConfigsAjin() []*ScrapeConfig {
|
||||
config0 := NewScrapeConfig()
|
||||
config0.depth = 0
|
||||
config0.selector = ".dt>a"
|
||||
config0.limit = 3
|
||||
config0.offset = 0
|
||||
config0.delay = 5000
|
||||
config0.include = false
|
||||
config0.Depth = 0
|
||||
config0.Selector = ".dt>a"
|
||||
config0.Limit = 3
|
||||
config0.Offset = 0
|
||||
config0.Delay = 5000
|
||||
config0.Include = false
|
||||
|
||||
config1 := NewScrapeConfig()
|
||||
config1.depth = 1
|
||||
config1.selector = ".nav_apb>a"
|
||||
config1.limit = 3
|
||||
config1.offset = 1
|
||||
config1.delay = 5000
|
||||
config1.include = false
|
||||
config1.Depth = 1
|
||||
config1.Selector = ".nav_apb>a"
|
||||
config1.Limit = 3
|
||||
config1.Offset = 1
|
||||
config1.Delay = 5000
|
||||
config1.Include = false
|
||||
|
||||
config2 := NewScrapeConfig()
|
||||
config2.depth = 2
|
||||
config2.imagesOnly = true
|
||||
config2.Depth = 2
|
||||
config2.ImagesOnly = true
|
||||
|
||||
return []*ScrapeConfig{config0, config1, config2}
|
||||
}
|
||||
|
||||
func NewScrapeConfigsWikipedia() []*ScrapeConfig {
|
||||
config0 := NewScrapeConfig()
|
||||
config0.depth = 0
|
||||
config0.threads = -1
|
||||
config0.include = true
|
||||
config0.Depth = 0
|
||||
config0.Threads = -1
|
||||
config0.Include = true
|
||||
|
||||
config1 := NewScrapeConfig()
|
||||
config1.depth = 1
|
||||
config1.include = true
|
||||
config1.Depth = 1
|
||||
config1.Include = true
|
||||
|
||||
return []*ScrapeConfig{config0, config1}
|
||||
}
|
||||
|
||||
func NewScrapeConfigFake() *ScrapeConfig {
|
||||
config := NewScrapeConfig()
|
||||
config.include = false
|
||||
config.Include = false
|
||||
|
||||
return config
|
||||
}
|
||||
|
||||
func NewBookFromURL(url, selector, name, author string, recursive, include, imagesOnly, quiet bool, limit, offset, delay, threads int) book {
|
||||
func NewBookFromURL(url string, selector []string, name, author string, include, ImagesOnly, useLinkName, quiet bool, limit, offset, delay, threads int) book {
|
||||
config1 := NewScrapeConfig()
|
||||
config1.imagesOnly = imagesOnly
|
||||
config1.ImagesOnly = ImagesOnly
|
||||
config1.UseLinkName = useLinkName
|
||||
|
||||
var chapters []chapter
|
||||
var home chapter
|
||||
|
||||
if recursive {
|
||||
if len(selector) > 0 {
|
||||
config2 := NewScrapeConfig()
|
||||
config2.selector = selector
|
||||
config2.limit = limit
|
||||
config2.offset = offset
|
||||
config2.delay = delay
|
||||
config2.threads = threads
|
||||
config2.include = include
|
||||
config2.imagesOnly = imagesOnly
|
||||
config2.Selector = selector[0]
|
||||
config2.Limit = limit
|
||||
config2.Offset = offset
|
||||
config2.Delay = delay
|
||||
config2.Threads = threads
|
||||
config2.Include = include
|
||||
config2.ImagesOnly = ImagesOnly
|
||||
config2.UseLinkName = useLinkName
|
||||
chapters, home = tableOfContent(url, config2, config1, quiet)
|
||||
} else {
|
||||
chapters = []chapter{NewChapterFromURL(url, []*ScrapeConfig{config1}, 0, func(index int, name string) {})}
|
||||
chapters = []chapter{NewChapterFromURL(url, "", []*ScrapeConfig{config1}, 0, func(index int, name string) {})}
|
||||
home = chapters[0]
|
||||
}
|
||||
|
||||
@@ -114,7 +131,7 @@ func NewBookFromURL(url, selector, name, author string, recursive, include, imag
|
||||
return b
|
||||
}
|
||||
|
||||
func NewChapterFromURL(url string, configs []*ScrapeConfig, index int, updateProgressBarName func(index int, name string)) chapter {
|
||||
func NewChapterFromURL(url, linkName string, configs []*ScrapeConfig, index int, updateProgressBarName func(index int, name string)) chapter {
|
||||
config := configs[0]
|
||||
|
||||
base, err := urllib.Parse(url)
|
||||
@@ -141,24 +158,31 @@ func NewChapterFromURL(url string, configs []*ScrapeConfig, index int, updatePro
|
||||
if err != nil {
|
||||
log.Fatalf("failed to parse %s, %v\n", url, err)
|
||||
}
|
||||
name := article.Title
|
||||
|
||||
// notify progress bar with new name
|
||||
updateProgressBarName(index, name)
|
||||
name := linkName
|
||||
if config.UseLinkName == false {
|
||||
name = article.Title
|
||||
|
||||
// notify progressbar with new name
|
||||
updateProgressBarName(index, name)
|
||||
}
|
||||
|
||||
subchapters := []chapter{}
|
||||
if len(configs) > 1 {
|
||||
// add subchapters
|
||||
|
||||
links, _, err := GetLinks(base, config.selector, config.limit, config.offset, false)
|
||||
links, _, _, err := GetLinks(base, config.Selector, config.Limit, config.Offset, false)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
subchapters = make([]chapter, len(links))
|
||||
progress := NewProgress(links, name, config.depth)
|
||||
var p progress
|
||||
if config.Quiet == false {
|
||||
p = NewProgress(links, name, config.Depth)
|
||||
}
|
||||
|
||||
if config.delay >= 0 {
|
||||
if config.Delay >= 0 {
|
||||
|
||||
// synchronous mode
|
||||
for index, link := range links {
|
||||
@@ -168,18 +192,20 @@ func NewChapterFromURL(url string, configs []*ScrapeConfig, index int, updatePro
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
sc := NewChapterFromURL(u.String(), configs[1:], index, progress.UpdateName)
|
||||
sc := NewChapterFromURL(u.String(), link.text, configs[1:], index, p.UpdateName)
|
||||
subchapters[index] = sc
|
||||
progress.Increment(index)
|
||||
if config.Quiet == false {
|
||||
p.Increment(index)
|
||||
}
|
||||
|
||||
time.Sleep(time.Duration(config.delay) * time.Millisecond)
|
||||
time.Sleep(time.Duration(config.Delay) * time.Millisecond)
|
||||
}
|
||||
|
||||
} else {
|
||||
// asynchronous mode
|
||||
var wg sync.WaitGroup
|
||||
|
||||
threads := config.threads
|
||||
threads := config.Threads
|
||||
if threads == -1 {
|
||||
threads = len(links)
|
||||
}
|
||||
@@ -199,9 +225,12 @@ func NewChapterFromURL(url string, configs []*ScrapeConfig, index int, updatePro
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
sc := NewChapterFromURL(u.String(), configs[1:], index, progress.UpdateName)
|
||||
sc := NewChapterFromURL(u.String(), l.text, configs[1:], index, p.UpdateName)
|
||||
subchapters[index] = sc
|
||||
progress.Increment(index)
|
||||
|
||||
if config.Quiet == false {
|
||||
p.Increment(index)
|
||||
}
|
||||
|
||||
<-semaphore
|
||||
}(index, l)
|
||||
@@ -211,13 +240,15 @@ func NewChapterFromURL(url string, configs []*ScrapeConfig, index int, updatePro
|
||||
}
|
||||
|
||||
content := ""
|
||||
if config.include {
|
||||
if config.Include {
|
||||
|
||||
// we care about the content only if we include this level
|
||||
// we care about the content only if:
|
||||
// - we include this level
|
||||
// - we use the page name
|
||||
content = article.Content
|
||||
|
||||
// extract images
|
||||
if config.imagesOnly {
|
||||
if config.ImagesOnly {
|
||||
|
||||
// parse HTML
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
|
||||
@@ -246,13 +277,13 @@ func tableOfContent(url string, config *ScrapeConfig, subConfig *ScrapeConfig, q
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
links, home, err := GetLinks(base, config.selector, config.limit, config.offset, config.include)
|
||||
links, _, home, err := GetLinks(base, config.Selector, config.Limit, config.Offset, config.Include)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
chapters := make([]chapter, len(links))
|
||||
delay := config.delay
|
||||
delay := config.Delay
|
||||
|
||||
var p progress
|
||||
if quiet == false {
|
||||
@@ -262,15 +293,15 @@ func tableOfContent(url string, config *ScrapeConfig, subConfig *ScrapeConfig, q
|
||||
if delay >= 0 {
|
||||
// synchronous mode
|
||||
|
||||
for index, link := range links {
|
||||
for index, l := range links {
|
||||
// and then use it to parse relative URLs
|
||||
u, err := base.Parse(link.href)
|
||||
u, err := base.Parse(l.href)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
chapters[index] = NewChapterFromURL(u.String(), []*ScrapeConfig{subConfig}, 0, func(index int, name string) {})
|
||||
|
||||
chapters[index] = NewChapterFromURL(u.String(), l.text, []*ScrapeConfig{subConfig}, 0, func(index int, name string) {})
|
||||
|
||||
if quiet == false {
|
||||
p.Increment(index)
|
||||
}
|
||||
@@ -287,7 +318,7 @@ func tableOfContent(url string, config *ScrapeConfig, subConfig *ScrapeConfig, q
|
||||
// asynchronous mode
|
||||
var wg sync.WaitGroup
|
||||
|
||||
threads := config.threads
|
||||
threads := config.Threads
|
||||
if threads == -1 {
|
||||
threads = len(links)
|
||||
}
|
||||
@@ -307,7 +338,7 @@ func tableOfContent(url string, config *ScrapeConfig, subConfig *ScrapeConfig, q
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
chapters[index] = NewChapterFromURL(u.String(), []*ScrapeConfig{subConfig}, 0, func(index int, name string) {})
|
||||
chapters[index] = NewChapterFromURL(u.String(), l.text, []*ScrapeConfig{subConfig}, 0, func(index int, name string) {})
|
||||
|
||||
if quiet == false {
|
||||
p.Increment(index)
|
||||
@@ -327,7 +358,7 @@ func GetPath(elm *goquery.Selection) string {
|
||||
|
||||
for {
|
||||
selector := strings.ToLower(goquery.NodeName(elm))
|
||||
if selector == "" {
|
||||
if len(selector) == 0 {
|
||||
break
|
||||
}
|
||||
|
||||
@@ -339,18 +370,18 @@ func GetPath(elm *goquery.Selection) string {
|
||||
return join
|
||||
}
|
||||
|
||||
func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool) ([]link, chapter, error) {
|
||||
func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool) ([]link, string, chapter, error) {
|
||||
selectorSet := true
|
||||
if selector == "" {
|
||||
if len(selector) == 0 {
|
||||
selector = "a"
|
||||
selectorSet = false
|
||||
}
|
||||
|
||||
// visit and count link classes
|
||||
pathLinks := map[string][]link{}
|
||||
pathCount := map[string]int{}
|
||||
pathMax := ""
|
||||
|
||||
// visit and count link classes
|
||||
c := colly.NewCollector()
|
||||
c.OnHTML(selector, func(e *colly.HTMLElement) {
|
||||
href := e.Attr("href")
|
||||
@@ -358,26 +389,40 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool)
|
||||
path := GetPath(e.DOM)
|
||||
key := path
|
||||
|
||||
// include element class in key if selector is set
|
||||
if !selectorSet {
|
||||
class := e.Attr("class")
|
||||
key = fmt.Sprintf("%s.%s", path, class)
|
||||
}
|
||||
if selectorSet {
|
||||
|
||||
if selectorSet || text != "" {
|
||||
// if selector is set, we use the selector specified by the user
|
||||
|
||||
key = selector
|
||||
pathLinks[key] = append(pathLinks[key], NewLink(href, text))
|
||||
pathCount[key] += len(text)
|
||||
pathCount[key] += 1
|
||||
pathMax = key
|
||||
|
||||
if pathCount[key] > pathCount[pathMax] {
|
||||
pathMax = key
|
||||
} else {
|
||||
|
||||
// if selector is not set, we compute the selector ourselves
|
||||
|
||||
class := e.Attr("class")
|
||||
// include the element class to make sure we have the same exact path for every link in the table of content
|
||||
key = fmt.Sprintf("%s.%s", path, class)
|
||||
|
||||
// we count this key if the link text is not empty
|
||||
if text != "" {
|
||||
pathLinks[key] = append(pathLinks[key], NewLink(href, text))
|
||||
pathCount[key] += len(text)
|
||||
|
||||
if pathCount[key] > pathCount[pathMax] {
|
||||
pathMax = key
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
})
|
||||
c.Visit(url.String())
|
||||
|
||||
links := pathLinks[pathMax]
|
||||
if len(links) == 0 {
|
||||
return []link{}, chapter{}, fmt.Errorf("no link found for selector: %s", selector)
|
||||
return []link{}, pathMax, chapter{}, fmt.Errorf("no link found for selector: %s", selector)
|
||||
}
|
||||
|
||||
end := len(links)
|
||||
@@ -387,12 +432,12 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool)
|
||||
|
||||
links = links[offset:end]
|
||||
|
||||
home := NewChapterFromURL(url.String(), []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
|
||||
home := NewChapterFromURL(url.String(), "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
|
||||
|
||||
if include {
|
||||
l := NewLink(url.String(), home.Name())
|
||||
links = append([]link{l}, links...)
|
||||
}
|
||||
|
||||
return links, home, nil
|
||||
return links, pathMax, home, nil
|
||||
}
|
||||
|
||||
@@ -8,7 +8,7 @@ import (
|
||||
func TestBody(t *testing.T) {
|
||||
|
||||
config := NewScrapeConfig()
|
||||
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config}, 0, func(index int, name string) {})
|
||||
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{config}, 0, func(index int, name string) {})
|
||||
|
||||
got := c.Body()
|
||||
want := "<!doctype html>\n<html lang=\"en-us\">\n <head>\n <title>Books</title>\n <link rel=\"shortcut icon\" href=\"/favicon.ico\" />\n <meta charset=\"utf-8\" />\n <meta name=\"generator\" content=\"Hugo 0.59.1\" />\n <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />\n <meta name=\"author\" content=\"John Doe\" />\n <meta name=\"description\" content=\" \" />\n <link rel=\"stylesheet\" href=\"https://books.lapw.at/css/main.min.88e7083eff65effb7485b6e6f38d10afbec25093a6fac42d734ce9024d3defbd.css\" />\n\n \n <meta name=\"twitter:card\" content=\"summary\"/>\n<meta name=\"twitter:title\" content=\"Books\"/>\n<meta name=\"twitter:description\" content=\" \"/>\n\n <meta property=\"og:title\" content=\"Books\" />\n<meta property=\"og:description\" content=\" \" />\n<meta property=\"og:type\" content=\"website\" />\n<meta property=\"og:url\" content=\"https://books.lapw.at/\" />\n\n\n\n </head>\n <body>\n <header class=\"app-header\">\n <a href=\"https://books.lapw.at/\"><img class=\"app-header-avatar\" src=\"/book.svg\" alt=\"John Doe\" /></a>\n <h1>Books</h1>\n <p> </p>\n <div class=\"app-header-social\">\n \n </div>\n </header>\n <main class=\"app-container\">\n \n <article>\n <h1>Books</h1>\n <ul class=\"posts-list\">\n \n <li class=\"posts-list-item\">\n <a class=\"posts-list-item-title\" href=\"https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/\">Discours de la Méthode</a>\n <span class=\"posts-list-item-description\">\n <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\" class=\"icon icon-clock\">\n <title>clock</title>\n <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 98 min read -\n 1637\n </span>\n </li>\n \n <li class=\"posts-list-item\">\n <a class=\"posts-list-item-title\" 
href=\"https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/\">The Twelve-Factor App</a>\n <span class=\"posts-list-item-description\">\n <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\" class=\"icon icon-clock\">\n <title>clock</title>\n <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 22 min read -\n 2011\n </span>\n </li>\n \n </ul>\n \n\n\n\n </article>\n\n </main>\n </body>\n</html>\n"
|
||||
@@ -22,7 +22,7 @@ func TestBody(t *testing.T) {
|
||||
func TestName(t *testing.T) {
|
||||
|
||||
config := NewScrapeConfig()
|
||||
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config}, 0, func(index int, name string) {})
|
||||
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{config}, 0, func(index int, name string) {})
|
||||
|
||||
got := c.Name()
|
||||
want := "Books"
|
||||
@@ -33,10 +33,25 @@ func TestName(t *testing.T) {
|
||||
|
||||
}
|
||||
|
||||
func TestCustomName(t *testing.T) {
|
||||
|
||||
config := NewScrapeConfig()
|
||||
config.UseLinkName = true
|
||||
c := NewChapterFromURL("https://books.lapw.at/", "Custom Name", []*ScrapeConfig{config}, 0, func(index int, name string) {})
|
||||
|
||||
got := c.Name()
|
||||
want := "Custom Name"
|
||||
|
||||
if got != want {
|
||||
t.Errorf("got %v, wanted %v", got, want)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func TestAuthor(t *testing.T) {
|
||||
|
||||
config := NewScrapeConfig()
|
||||
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config}, 0, func(index int, name string) {})
|
||||
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{config}, 0, func(index int, name string) {})
|
||||
|
||||
got := c.Author()
|
||||
want := "John Doe"
|
||||
@@ -50,7 +65,7 @@ func TestAuthor(t *testing.T) {
|
||||
func TestContent(t *testing.T) {
|
||||
|
||||
config := NewScrapeConfig()
|
||||
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config}, 0, func(index int, name string) {})
|
||||
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{config}, 0, func(index int, name string) {})
|
||||
|
||||
got := c.Content()
|
||||
want := "<div id=\"readability-page-1\" class=\"page\">\n \n <main>\n \n <article>\n \n <ul>\n \n <li>\n <a href=\"https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/\">Discours de la Méthode</a>\n <span>\n <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\">\n <title>clock</title>\n <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 98 min read -\n 1637\n </span>\n </li>\n \n <li>\n <a href=\"https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/\">The Twelve-Factor App</a>\n <span>\n <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\">\n <title>clock</title>\n <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 22 min read -\n 2011\n </span>\n </li>\n \n </ul>\n \n\n\n\n </article>\n\n </main>\n \n\n</div>"
|
||||
@@ -64,12 +79,12 @@ func TestContent(t *testing.T) {
|
||||
func TestDelay(t *testing.T) {
|
||||
|
||||
config0 := NewScrapeConfig()
|
||||
config0.delay = 500
|
||||
config0.Delay = 500
|
||||
|
||||
config1 := NewScrapeConfig()
|
||||
|
||||
start := time.Now()
|
||||
NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {})
|
||||
NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {})
|
||||
elapsed := time.Since(start)
|
||||
|
||||
got := elapsed
|
||||
@@ -84,9 +99,9 @@ func TestDelay(t *testing.T) {
|
||||
func TestContentImagesOnly(t *testing.T) {
|
||||
|
||||
config := NewScrapeConfig()
|
||||
config.imagesOnly = true
|
||||
config.ImagesOnly = true
|
||||
|
||||
c := NewChapterFromURL("https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/", []*ScrapeConfig{config}, 0, func(index int, name string) {})
|
||||
c := NewChapterFromURL("https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/", "", []*ScrapeConfig{config}, 0, func(index int, name string) {})
|
||||
|
||||
got := c.Content()
|
||||
want := "<img src=\"https://books.lapw.at/images/codebase-deploys.png\" alt=\"One codebase maps to many deploys\"/><img src=\"https://books.lapw.at/images/attached-resources.png\" alt=\"A production deploy attached to four backing services.\"/><img src=\"https://books.lapw.at/images/release.png\" alt=\"Code becomes a build, which is combined with config to create a release.\"/><img src=\"https://books.lapw.at/images/process-types.png\" alt=\"Scale is expressed as running processes, workload diversity is expressed as process types.\"/>"
|
||||
@@ -102,7 +117,7 @@ func TestSubChapters(t *testing.T) {
|
||||
config0 := NewScrapeConfig()
|
||||
config1 := NewScrapeConfig()
|
||||
|
||||
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {})
|
||||
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {})
|
||||
|
||||
got := len(c.SubChapters())
|
||||
want := 2
|
||||
@@ -116,11 +131,11 @@ func TestSubChapters(t *testing.T) {
|
||||
func TestSubChaptersSelector(t *testing.T) {
|
||||
|
||||
config0 := NewScrapeConfig()
|
||||
config0.selector = "section.concrete > article > h2 > a"
|
||||
config0.Selector = "section.concrete > article > h2 > a"
|
||||
|
||||
config1 := NewScrapeConfig()
|
||||
|
||||
c := NewChapterFromURL("https://12factor.net/", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {})
|
||||
c := NewChapterFromURL("https://12factor.net/", "", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {})
|
||||
|
||||
got := len(c.SubChapters())
|
||||
want := 12
|
||||
@@ -134,11 +149,11 @@ func TestSubChaptersSelector(t *testing.T) {
|
||||
func TestSubChaptersLimit(t *testing.T) {
|
||||
|
||||
config0 := NewScrapeConfig()
|
||||
config0.limit = 1
|
||||
config0.Limit = 1
|
||||
|
||||
config1 := NewScrapeConfig()
|
||||
|
||||
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {})
|
||||
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {})
|
||||
|
||||
got := len(c.SubChapters())
|
||||
want := 1
|
||||
@@ -152,11 +167,11 @@ func TestSubChaptersLimit(t *testing.T) {
|
||||
func TestSubChaptersLimitOver(t *testing.T) {
|
||||
|
||||
config0 := NewScrapeConfig()
|
||||
config0.limit = 3
|
||||
config0.Limit = 3
|
||||
|
||||
config1 := NewScrapeConfig()
|
||||
|
||||
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {})
|
||||
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {})
|
||||
|
||||
got := len(c.SubChapters())
|
||||
want := 2
|
||||
@@ -170,9 +185,9 @@ func TestSubChaptersLimitOver(t *testing.T) {
|
||||
func TestNotInclude(t *testing.T) {
|
||||
|
||||
config := NewScrapeConfig()
|
||||
config.include = false
|
||||
config.Include = false
|
||||
|
||||
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config}, 0, func(index int, name string) {})
|
||||
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{config}, 0, func(index int, name string) {})
|
||||
|
||||
got := c.Content()
|
||||
want := ""
|
||||
|
||||
Reference in New Issue
Block a user