Add HTML output format; handle lazy-loaded images

This commit is contained in:
lapwat
2022-08-09 18:21:18 +02:00
parent 97e7d7a5bb
commit d73ae0a73b
8 changed files with 137 additions and 19 deletions

View File

@@ -24,6 +24,7 @@ func Filename(name string) string {
func ToMarkdownString(c chapter) string {
markdown := ""
// chapter content
if c.config.Include {
// title
markdown += fmt.Sprintf("%s\n", c.Name())
@@ -37,8 +38,8 @@ func ToMarkdownString(c chapter) string {
markdown += fmt.Sprintf("%s\n\n\n", content)
}
// subchapters content
for _, sc := range c.SubChapters() {
// subchapters content
markdown += fmt.Sprintf("%s\n\n\n", ToMarkdownString(sc))
}
@@ -66,6 +67,44 @@ func ToMarkdown(c chapter, filename string) string {
return filename
}
// ToHtmlString renders chapter c and, recursively, every one of its
// subchapters as a single HTML fragment.
//
// When the chapter's config has Include set, the chapter name is emitted as an
// <h1> heading immediately followed by the chapter content. The output of each
// subchapter is then appended in the order SubChapters returns them.
func ToHtmlString(c chapter) string {
	var out string

	// this chapter's own heading and body
	if c.config.Include {
		out = fmt.Sprintf("<h1>%s</h1>", c.Name()) + c.Content()
	}

	// nested chapters, depth first
	for _, sub := range c.SubChapters() {
		out += ToHtmlString(sub)
	}

	return out
}
// ToHtml writes chapter c, including all of its subchapters, to an HTML file
// and returns the name of the file that was written.
//
// When filename is empty, the name is built from Filename(c.Name()) with an
// ".html" extension. The rendered chapter is wrapped in a minimal
// <html><head></head><body>...</body></html> skeleton.
//
// Any failure to create, write or close the file is reported through
// log.Fatal, which terminates the program.
func ToHtml(c chapter, filename string) string {
	if len(filename) == 0 {
		filename = fmt.Sprintf("%s.html", Filename(c.Name()))
	}

	html := fmt.Sprintf("<html><head></head><body>%s</body></html>", ToHtmlString(c))

	// write to file
	f, err := os.Create(filename)
	if err != nil {
		log.Fatal(err)
	}
	if _, err := f.WriteString(html); err != nil {
		log.Fatal(err)
	}
	// Close may surface a write error that was deferred by the OS; ignoring
	// it could leave a truncated file behind while still reporting success.
	if err := f.Close(); err != nil {
		log.Fatal(err)
	}

	return filename
}
func ToEpub(c chapter, filename string) string {
if len(filename) == 0 {
filename = fmt.Sprintf("%s.epub", Filename(c.Name()))
@@ -88,6 +127,7 @@ func ToEpub(c chapter, filename string) string {
func AppendToEpub(e *epub.Epub, c chapter) {
content := ""
// chapter content
if c.config.Include {
if c.config.ImagesOnly == false {
@@ -129,6 +169,7 @@ func AppendToEpub(e *epub.Epub, c chapter) {
}
// subchapters content
for _, sc := range c.SubChapters() {
AppendToEpub(e, sc)
}

View File

@@ -22,7 +22,7 @@ func TestToMarkdownString(t *testing.T) {
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
got := ToMarkdownString(c)
want := "Books\n=====\n\n- [Discours de la Méthode](https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/)clock 98 min read -\n1637\n\n- [The Twelve-Factor App](https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/)clock 22 min read -\n2011\n\n\n"
want := "Books\n=====\n\n- [Discours de la Méthode](https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/)clock 98 min read -\n 1637\n\n- [The Twelve-Factor App](https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/)clock 22 min read -\n 2011\n\n\n"
if got != want {
t.Errorf("got %q, wanted %q", got, want)
@@ -62,6 +62,51 @@ func TestToMarkdownFilename(t *testing.T) {
}
// TestToHtmlString builds a chapter from https://books.lapw.at/ with a default
// scrape config and checks that ToHtmlString returns the expected <h1> title
// followed by the page markup.
func TestToHtmlString(t *testing.T) {
	const want = "<h1>Books</h1>\n \n <main>\n \n <article>\n \n <ul>\n \n <li>\n <a href=\"https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/\">Discours de la Méthode</a>\n <span>\n <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\">\n <title>clock</title>\n <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 98 min read -\n 1637\n </span>\n </li>\n \n <li>\n <a href=\"https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/\">The Twelve-Factor App</a>\n <span>\n <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\">\n <title>clock</title>\n <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 22 min read -\n 2011\n </span>\n </li>\n \n </ul>\n \n\n\n\n </article>\n\n </main>\n \n\n"

	root := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})

	got := ToHtmlString(root)
	if got == want {
		return
	}
	t.Errorf("got %q, wanted %q", got, want)
}
// TestToHtml calls ToHtml with an empty filename and checks that the file
// "Books.html" exists afterwards. The file is removed again when it is found.
func TestToHtml(t *testing.T) {
	const filename = "Books.html"

	root := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
	ToHtml(root, "")

	_, statErr := os.Stat(filename)
	if errors.Is(statErr, os.ErrNotExist) {
		t.Errorf("%s does not exist: %v", filename, statErr)
		return
	}

	// clean up the generated file
	if rmErr := os.Remove(filename); rmErr != nil {
		t.Errorf("cannot remove %v: %v", filename, rmErr)
	}
}
// TestToHtmlFilename calls ToHtml with the explicit filename "ebook.html" and
// checks that a file with that name exists afterwards. The file is removed
// again when it is found.
func TestToHtmlFilename(t *testing.T) {
	const filename = "ebook.html"

	root := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
	ToHtml(root, filename)

	_, statErr := os.Stat(filename)
	if errors.Is(statErr, os.ErrNotExist) {
		t.Errorf("%s does not exist: %v", filename, statErr)
		return
	}

	// clean up the generated file
	if rmErr := os.Remove(filename); rmErr != nil {
		t.Errorf("cannot remove %v: %v", filename, rmErr)
	}
}
func TestToEpub(t *testing.T) {
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})

View File

@@ -250,27 +250,42 @@ func NewChapterFromURL(url, linkName string, configs []*ScrapeConfig, index int,
// we care about the content only if:
// - we include this level
// - we use the page name
content = article.Content
// parse HTML
doc, err := goquery.NewDocumentFromReader(strings.NewReader(article.Content))
if err != nil {
log.Fatal(err)
}
// handle lazy images
doc.Find("img").Each(func(i int, source *goquery.Selection) {
src, exists := source.Attr("data-lazy-src")
if exists {
source.SetAttr("src", src)
}
})
doc.Find("source").Remove()
// extract images
if config.ImagesOnly {
// parse HTML
doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
if err != nil {
log.Fatal(err)
}
// append every image to content
content = ""
doc.Find("img").Each(func(i int, s *goquery.Selection) {
imageTag, _ := goquery.OuterHtml(s)
imageTag = strings.ReplaceAll(imageTag, "\n", "")
// imageTag = strings.ReplaceAll(imageTag, "\n", "")
content += imageTag
})
} else {
content, err = doc.Find("[id*=readability-page]").Html()
if err != nil {
log.Fatal(err)
}
}
}
return chapter{string(body), name, article.Byline, content, subchapters, config}

View File

@@ -68,7 +68,7 @@ func TestContent(t *testing.T) {
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{config}, 0, func(index int, name string) {})
got := c.Content()
want := "<div id=\"readability-page-1\" class=\"page\">\n \n <main>\n \n <article>\n \n <ul>\n \n <li>\n <a href=\"https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/\">Discours de la Méthode</a>\n <span>\n <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\">\n <title>clock</title>\n <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 98 min read -\n 1637\n </span>\n </li>\n \n <li>\n <a href=\"https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/\">The Twelve-Factor App</a>\n <span>\n <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\">\n <title>clock</title>\n <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 22 min read -\n 2011\n </span>\n </li>\n \n </ul>\n \n\n\n\n </article>\n\n </main>\n \n\n</div>"
want := "\n \n <main>\n \n <article>\n \n <ul>\n \n <li>\n <a href=\"https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/\">Discours de la Méthode</a>\n <span>\n <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\">\n <title>clock</title>\n <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 98 min read -\n 1637\n </span>\n </li>\n \n <li>\n <a href=\"https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/\">The Twelve-Factor App</a>\n <span>\n <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\">\n <title>clock</title>\n <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 22 min read -\n 2011\n </span>\n </li>\n \n </ul>\n \n\n\n\n </article>\n\n </main>\n \n\n"
if got != want {
t.Errorf("got %v, wanted %v", got, want)