From 3eaf9da273df4f39640ccfe925dfd8f5c5753669 Mon Sep 17 00:00:00 2001
From: cornelk
Date: Wed, 11 Sep 2024 08:27:53 -0600
Subject: [PATCH] support inline css

---
 css/css.go                                |  51 +++++++++
 htmlindex/attributes.go                   |  25 ++++-
 htmlindex/htmlindex.go                    | 122 +++++++++++++++-------
 htmlindex/htmlindex_test.go               |   4 +-
 scraper/css.go                            |  71 -------------
 scraper/download.go                       |  37 ++++++-
 scraper/{css_test.go => download_test.go} |   4 +-
 scraper/html_test.go                      |   2 +-
 scraper/scraper.go                        |   2 +-
 scraper/scraper_test.go                   |  36 +++++++
 10 files changed, 238 insertions(+), 116 deletions(-)
 create mode 100644 css/css.go
 delete mode 100644 scraper/css.go
 rename scraper/{css_test.go => download_test.go} (94%)

diff --git a/css/css.go b/css/css.go
new file mode 100644
index 0000000..744f46c
--- /dev/null
+++ b/css/css.go
@@ -0,0 +1,51 @@
+package css
+
+import (
+	"net/url"
+	"regexp"
+	"strings"
+
+	"github.com/cornelk/gotokit/log"
+	"github.com/gorilla/css/scanner"
+)
+
+var cssURLRe = regexp.MustCompile(`^url\(['"]?(.*?)['"]?\)$`)
+
+type Token = scanner.Token
+
+type urlProcessor func(token *Token, data string, url *url.URL)
+
+// Process the CSS data and call a processor for every found URL.
+func Process(logger *log.Logger, url *url.URL, data string, processor urlProcessor) {
+	css := scanner.New(data)
+
+	for {
+		token := css.Next()
+		if token.Type == scanner.TokenEOF || token.Type == scanner.TokenError {
+			break
+		}
+		if token.Type != scanner.TokenURI {
+			continue
+		}
+
+		match := cssURLRe.FindStringSubmatch(token.Value)
+		if match == nil {
+			continue
+		}
+
+		src := match[1]
+		if strings.HasPrefix(strings.ToLower(src), "data:") {
+			continue // skip embedded data
+		}
+
+		u, err := url.Parse(src)
+		if err != nil {
+			logger.Error("Parsing URL failed",
+				log.String("url", src),
+				log.Err(err))
+			continue
+		}
+		u = url.ResolveReference(u)
+		processor(token, src, u)
+	}
+}
diff --git a/htmlindex/attributes.go b/htmlindex/attributes.go
index 36721ce..16e2277 100644
--- a/htmlindex/attributes.go
+++ b/htmlindex/attributes.go
@@ -1,13 +1,29 @@
 package htmlindex
 
+import (
+	"net/url"
+
+	"github.com/cornelk/gotokit/log"
+	"golang.org/x/net/html"
+)
+
+type nodeAttributeParserData struct {
+	logger    *log.Logger
+	url       *url.URL
+	node      *html.Node
+	attribute string
+	value     string
+}
+
 // nodeAttributeParser returns the URL values of the attribute of the node and
 // whether the attribute has been processed.
-type nodeAttributeParser func(attribute, value string) ([]string, bool)
+type nodeAttributeParser func(data nodeAttributeParserData) ([]string, bool)
 
 type Node struct {
 	Attributes []string
 
-	parser nodeAttributeParser
+	noChildParsing bool
+	parser         nodeAttributeParser
 }
 
 const (
@@ -27,6 +43,7 @@ const (
 	ImgTag    = "img"
 	LinkTag   = "link"
 	ScriptTag = "script"
+	StyleTag  = "style"
 )
 
 // Nodes describes the HTML tags and their attributes that can contain URL.
@@ -47,6 +64,10 @@ var Nodes = map[string]Node{
 	ScriptTag: {
 		Attributes: []string{SrcAttribute},
 	},
+	StyleTag: {
+		noChildParsing: true,
+		parser:         styleParser,
+	},
 }
 
 // SrcSetAttributes contains the attributes that contain srcset values.
diff --git a/htmlindex/htmlindex.go b/htmlindex/htmlindex.go
index aaf15a1..19e7033 100644
--- a/htmlindex/htmlindex.go
+++ b/htmlindex/htmlindex.go
@@ -6,55 +6,64 @@ import (
 	"sort"
 	"strings"
 
+	"github.com/cornelk/goscrape/css"
+	"github.com/cornelk/gotokit/log"
 	"golang.org/x/net/html"
 )
 
 // Index provides an index for all HTML tags of relevance for scraping.
type Index struct { + logger *log.Logger + // key is HTML tag, value is a map of all its urls and the HTML nodes for it data map[string]map[string][]*html.Node } // New returns a new index. -func New() *Index { +func New(logger *log.Logger) *Index { return &Index{ - data: make(map[string]map[string][]*html.Node), + logger: logger, + data: make(map[string]map[string][]*html.Node), } } // Index the given HTML document. -func (h *Index) Index(baseURL *url.URL, node *html.Node) { +func (idx *Index) Index(baseURL *url.URL, node *html.Node) { for child := node.FirstChild; child != nil; child = child.NextSibling { - if child.Type != html.ElementNode { - continue + switch child.Type { + case html.ElementNode: + idx.indexElementNode(baseURL, node, child) + default: } + } +} - var references []string +func (idx *Index) indexElementNode(baseURL *url.URL, node, child *html.Node) { + var references []string - info, ok := Nodes[child.Data] - if ok { - references = nodeAttributeURLs(baseURL, child, info.parser, info.Attributes...) - } + info, ok := Nodes[child.Data] + if ok { + references = idx.nodeAttributeURLs(baseURL, child, info.parser, info.Attributes...) + } - m, ok := h.data[child.Data] - if !ok { - m = map[string][]*html.Node{} - h.data[child.Data] = m - } + m, ok := idx.data[child.Data] + if !ok { + m = map[string][]*html.Node{} + idx.data[child.Data] = m + } - for _, reference := range references { - m[reference] = append(m[reference], child) - } + for _, reference := range references { + m[reference] = append(m[reference], child) + } - if node.FirstChild != nil { - h.Index(baseURL, child) - } + if node.FirstChild != nil && !info.noChildParsing { + idx.Index(baseURL, child) } } // URLs returns all URLs of the references found for a specific tag. -func (h *Index) URLs(tag string) ([]*url.URL, error) { - m, ok := h.data[tag] +func (idx *Index) URLs(tag string) ([]*url.URL, error) { + m, ok := idx.data[tag] if !ok { return nil, nil } @@ -78,8 +87,8 @@ func (h *Index) URLs(tag string) ([]*url.URL, error) { } // Nodes returns a map of all URLs and their HTML nodes. -func (h *Index) Nodes(tag string) map[string][]*html.Node { - m, ok := h.data[tag] +func (idx *Index) Nodes(tag string) map[string][]*html.Node { + m, ok := idx.data[tag] if ok { return m } @@ -87,11 +96,23 @@ func (h *Index) Nodes(tag string) map[string][]*html.Node { } // nodeAttributeURLs returns resolved URLs based on the base URL and the HTML node attribute values. 
-func nodeAttributeURLs(baseURL *url.URL, node *html.Node, +func (idx *Index) nodeAttributeURLs(baseURL *url.URL, node *html.Node, parser nodeAttributeParser, attributeName ...string) []string { var results []string + processReferences := func(references []string) { + for _, reference := range references { + ur, err := url.Parse(reference) + if err != nil { + continue + } + + ur = baseURL.ResolveReference(ur) + results = append(results, ur.String()) + } + } + for _, attr := range node.Attr { var process bool for _, name := range attributeName { @@ -108,34 +129,44 @@ func nodeAttributeURLs(baseURL *url.URL, node *html.Node, var parserHandled bool if parser != nil { - references, parserHandled = parser(attr.Key, strings.TrimSpace(attr.Val)) + data := nodeAttributeParserData{ + logger: idx.logger, + url: baseURL, + node: node, + attribute: attr.Key, + value: strings.TrimSpace(attr.Val), + } + references, parserHandled = parser(data) } if parser == nil || !parserHandled { references = append(references, strings.TrimSpace(attr.Val)) } - for _, reference := range references { - ur, err := url.Parse(reference) - if err != nil { - continue - } + processReferences(references) + } - ur = baseURL.ResolveReference(ur) - results = append(results, ur.String()) + // special case to support style tag + if len(attributeName) == 0 && parser != nil { + data := nodeAttributeParserData{ + logger: idx.logger, + url: baseURL, + node: node, } + references, _ := parser(data) + processReferences(references) } return results } // srcSetValueSplitter returns the URL values of the srcset attribute of img nodes. -func srcSetValueSplitter(attribute, attributeValue string) ([]string, bool) { - if _, isSrcSet := SrcSetAttributes[attribute]; !isSrcSet { +func srcSetValueSplitter(data nodeAttributeParserData) ([]string, bool) { + if _, isSrcSet := SrcSetAttributes[data.attribute]; !isSrcSet { return nil, false } // split the set of responsive images - values := strings.Split(attributeValue, ",") + values := strings.Split(data.value, ",") for i, value := range values { value = strings.TrimSpace(value) @@ -145,3 +176,20 @@ func srcSetValueSplitter(attribute, attributeValue string) ([]string, bool) { return values, true } + +// styleParser returns the URL values of a CSS style tag. 
+func styleParser(data nodeAttributeParserData) ([]string, bool) {
+	if data.node.FirstChild == nil {
+		return nil, false
+	}
+
+	var urls []string
+	processor := func(_ *css.Token, _ string, url *url.URL) {
+		urls = append(urls, url.String())
+	}
+
+	cssData := data.node.FirstChild.Data
+	css.Process(data.logger, data.url, cssData, processor)
+
+	return urls, true
+}
diff --git a/htmlindex/htmlindex_test.go b/htmlindex/htmlindex_test.go
index 9324509..33fef19 100644
--- a/htmlindex/htmlindex_test.go
+++ b/htmlindex/htmlindex_test.go
@@ -5,6 +5,7 @@ import (
 	"net/url"
 	"testing"
 
+	"github.com/cornelk/gotokit/log"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 	"golang.org/x/net/html"
@@ -91,7 +92,8 @@ func testSetup(t *testing.T, input []byte) *Index {
 	ur, err := url.Parse("https://domain.com/")
 	require.NoError(t, err)
 
-	idx := New()
+	logger := log.NewTestLogger(t)
+	idx := New(logger)
 	idx.Index(ur, doc)
 
 	return idx
diff --git a/scraper/css.go b/scraper/css.go
deleted file mode 100644
index 8aeee46..0000000
--- a/scraper/css.go
+++ /dev/null
@@ -1,71 +0,0 @@
-package scraper
-
-import (
-	"bytes"
-	"fmt"
-	"net/url"
-	"path"
-	"regexp"
-	"strings"
-
-	"github.com/cornelk/gotokit/log"
-	"github.com/gorilla/css/scanner"
-)
-
-var cssURLRe = regexp.MustCompile(`^url\(['"]?(.*?)['"]?\)$`)
-
-func (s *Scraper) checkCSSForUrls(url *url.URL, buf *bytes.Buffer) *bytes.Buffer {
-	urls := make(map[string]string)
-	str := buf.String()
-	css := scanner.New(str)
-
-	for {
-		token := css.Next()
-		if token.Type == scanner.TokenEOF || token.Type == scanner.TokenError {
-			break
-		}
-		if token.Type != scanner.TokenURI {
-			continue
-		}
-
-		match := cssURLRe.FindStringSubmatch(token.Value)
-		if match == nil {
-			continue
-		}
-
-		src := match[1]
-		if strings.HasPrefix(strings.ToLower(src), "data:") {
-			continue // skip embedded data
-		}
-
-		u, err := url.Parse(src)
-		if err != nil {
-			s.logger.Error("Parsing URL failed",
-				log.String("url", src),
-				log.Err(err))
-			continue
-		}
-		u = url.ResolveReference(u)
-
-		s.imagesQueue = append(s.imagesQueue, u)
-
-		cssPath := *url
-		cssPath.Path = path.Dir(cssPath.Path) + "/"
-		resolved := resolveURL(&cssPath, src, s.URL.Host, false, "")
-		urls[token.Value] = resolved
-	}
-
-	if len(urls) == 0 {
-		return buf
-	}
-
-	for ori, filePath := range urls {
-		fixed := fmt.Sprintf("url(%s)", filePath)
-		str = strings.ReplaceAll(str, ori, fixed)
-		s.logger.Debug("CSS Element relinked",
-			log.String("url", ori),
-			log.String("fixed_url", fixed))
-	}
-
-	return bytes.NewBufferString(str)
-}
diff --git a/scraper/download.go b/scraper/download.go
index f8011e7..28d946f 100644
--- a/scraper/download.go
+++ b/scraper/download.go
@@ -6,7 +6,10 @@ import (
 	"errors"
 	"fmt"
 	"net/url"
+	"path"
+	"strings"
 
+	"github.com/cornelk/goscrape/css"
 	"github.com/cornelk/goscrape/htmlindex"
 	"github.com/cornelk/gotokit/log"
 )
@@ -19,6 +22,7 @@ var tagsWithReferences = []string{
 	htmlindex.LinkTag,
 	htmlindex.ScriptTag,
 	htmlindex.BodyTag,
+	htmlindex.StyleTag,
 }
 
 func (s *Scraper) downloadReferences(ctx context.Context, index *htmlindex.Index) error {
@@ -44,7 +48,7 @@
 
 	var processor assetProcessor
 	if tag == htmlindex.LinkTag {
-		processor = s.checkCSSForUrls
+		processor = s.cssProcessor
 	}
 	for _, ur := range references {
 		if err := s.downloadAsset(ctx, ur, processor); err != nil && errors.Is(err, context.Canceled) {
@@ -98,3 +102,34 @@ func (s *Scraper) downloadAsset(ctx context.Context, u *url.URL, processor asset
 	return nil
 }
+
+func (s *Scraper) cssProcessor(baseURL *url.URL, buf *bytes.Buffer) *bytes.Buffer {
+	urls := make(map[string]string)
+
+	processor := func(token *css.Token, data string, u *url.URL) {
+		s.imagesQueue = append(s.imagesQueue, u)
+
+		cssPath := *baseURL
+		cssPath.Path = path.Dir(cssPath.Path) + "/"
+		resolved := resolveURL(&cssPath, data, s.URL.Host, false, "")
+		urls[token.Value] = resolved
+	}
+
+	data := buf.String()
+	css.Process(s.logger, baseURL, data, processor)
+
+	if len(urls) == 0 {
+		return buf
+	}
+
+	str := buf.String()
+	for ori, filePath := range urls {
+		fixed := fmt.Sprintf("url(%s)", filePath)
+		str = strings.ReplaceAll(str, ori, fixed)
+		s.logger.Debug("CSS Element relinked",
+			log.String("url", ori),
+			log.String("fixed_url", fixed))
+	}
+
+	return bytes.NewBufferString(str)
+}
diff --git a/scraper/css_test.go b/scraper/download_test.go
similarity index 94%
rename from scraper/css_test.go
rename to scraper/download_test.go
index f8cb777..0a625ea 100644
--- a/scraper/css_test.go
+++ b/scraper/download_test.go
@@ -10,7 +10,7 @@ import (
 	"github.com/stretchr/testify/require"
 )
 
-func TestCheckCSSForURLs(t *testing.T) {
+func TestCSSProcessor(t *testing.T) {
 	logger := log.NewTestLogger(t)
 	cfg := Config{
 		URL: "http://localhost",
@@ -33,7 +33,7 @@ func TestCheckCSSForURLs(t *testing.T) {
 	for input, expected := range fixtures {
 		s.imagesQueue = nil
 		buf := bytes.NewBufferString(input)
-		s.checkCSSForUrls(u, buf)
+		s.cssProcessor(u, buf)
 
 		if expected == "" {
 			assert.Empty(t, s.imagesQueue)
diff --git a/scraper/html_test.go b/scraper/html_test.go
index b85fe91..0261f8a 100644
--- a/scraper/html_test.go
+++ b/scraper/html_test.go
@@ -33,7 +33,7 @@ func TestFixURLReferences(t *testing.T) {
 	doc, err := html.Parse(buf)
 	require.NoError(t, err)
 
-	index := htmlindex.New()
+	index := htmlindex.New(logger)
 	index.Index(s.URL, doc)
 
 	ref, fixed, err := s.fixURLReferences(s.URL, doc, index)
diff --git a/scraper/scraper.go b/scraper/scraper.go
index 113629f..ce0a115 100644
--- a/scraper/scraper.go
+++ b/scraper/scraper.go
@@ -218,7 +218,7 @@ func (s *Scraper) processURL(ctx context.Context, u *url.URL, currentDepth uint)
 		return fmt.Errorf("parsing HTML: %w", err)
 	}
 
-	index := htmlindex.New()
+	index := htmlindex.New(s.logger)
 	index.Index(u, doc)
 
 	s.storeDownload(u, buf, doc, index, fileExtension)
diff --git a/scraper/scraper_test.go b/scraper/scraper_test.go
index 4246d63..7d418f7 100644
--- a/scraper/scraper_test.go
+++ b/scraper/scraper_test.go
@@ -130,3 +130,39 @@ func TestScraperAttributes(t *testing.T) {
 	}
 	assert.Equal(t, expectedProcessed, scraper.processed)
 }
+
+func TestScraperInternalCss(t *testing.T) {
+	indexPage := []byte(`
+<html>
+<head>
+<style>
+body {
+	background-image: url("background.jpg");
+}
+</style>
+</head>
+<body></body>
+</html>
+`)
+	empty := []byte(``)
+
+	startURL := "https://example.org/"
+	urls := map[string][]byte{
+		"https://example.org/":               indexPage,
+		"https://example.org/background.jpg": empty,
+	}
+
+	scraper := newTestScraper(t, startURL, urls)
+	require.NotNil(t, scraper)
+
+	ctx := context.Background()
+	err := scraper.Start(ctx)
+	require.NoError(t, err)
+
+	expectedProcessed := map[string]struct{}{
+		"/":               {},
+		"/background.jpg": {},
+	}
+	assert.Equal(t, expectedProcessed, scraper.processed)
+}
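
For reference, the new `css` package can be exercised on its own. The sketch below is written as a Go test so it can reuse the `log.NewTestLogger` helper that the patch itself uses in `htmlindex_test.go`; the test name, the sample stylesheet, and the URLs are illustrative assumptions rather than part of the change:

```go
package css_test

import (
	"net/url"
	"testing"

	"github.com/cornelk/goscrape/css"
	"github.com/cornelk/gotokit/log"
)

// Collect every resolved url() reference found in a stylesheet.
func TestProcessSketch(t *testing.T) {
	logger := log.NewTestLogger(t)

	base, err := url.Parse("https://example.org/assets/site.css")
	if err != nil {
		t.Fatal(err)
	}

	stylesheet := `
body { background: url("../img/bg.png"); }
h1 { background: url(data:image/png;base64,AAAA); }`

	var found []string
	css.Process(logger, base, stylesheet, func(_ *css.Token, _ string, u *url.URL) {
		// u is the url(...) value already resolved against base
		found = append(found, u.String())
	})

	if len(found) != 1 || found[0] != "https://example.org/img/bg.png" {
		t.Fatalf("unexpected URLs: %v", found)
	}
}
```

Relative references come back resolved against the stylesheet's base URL and `data:` URIs are skipped, which is the behavior that both `cssProcessor` and `styleParser` above build on.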
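
In the same spirit, a hypothetical test showing how the new `StyleTag` entry exposes inline `<style>` blocks through the index's exported API; `noChildParsing` stops the indexer from descending into the style element's text node, and `styleParser` extracts the URLs from it instead:

```go
package htmlindex_test

import (
	"net/url"
	"strings"
	"testing"

	"github.com/cornelk/goscrape/htmlindex"
	"github.com/cornelk/gotokit/log"
	"golang.org/x/net/html"
)

func TestStyleTagIndexSketch(t *testing.T) {
	page := `<html><head><style>
body { background: url("bg.png"); }
</style></head><body></body></html>`

	doc, err := html.Parse(strings.NewReader(page))
	if err != nil {
		t.Fatal(err)
	}

	base, err := url.Parse("https://example.org/")
	if err != nil {
		t.Fatal(err)
	}

	idx := htmlindex.New(log.NewTestLogger(t))
	idx.Index(base, doc)

	// the style tag's url() references are now regular index entries
	urls, err := idx.URLs(htmlindex.StyleTag)
	if err != nil {
		t.Fatal(err)
	}
	if len(urls) != 1 || urls[0].String() != "https://example.org/bg.png" {
		t.Fatalf("unexpected URLs: %v", urls)
	}
}
```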