From 3eaf9da273df4f39640ccfe925dfd8f5c5753669 Mon Sep 17 00:00:00 2001
From: cornelk
Date: Wed, 11 Sep 2024 08:27:53 -0600
Subject: [PATCH] support inline css

---
 css/css.go                                |  51 +++++++++
 htmlindex/attributes.go                   |  25 ++++-
 htmlindex/htmlindex.go                    | 122 +++++++++++++++-------
 htmlindex/htmlindex_test.go               |   4 +-
 scraper/css.go                            |  71 -------------
 scraper/download.go                       |  37 ++++++-
 scraper/{css_test.go => download_test.go} |   4 +-
 scraper/html_test.go                      |   2 +-
 scraper/scraper.go                        |   2 +-
 scraper/scraper_test.go                   |  36 +++++++
 10 files changed, 238 insertions(+), 116 deletions(-)
 create mode 100644 css/css.go
 delete mode 100644 scraper/css.go
 rename scraper/{css_test.go => download_test.go} (94%)

diff --git a/css/css.go b/css/css.go
new file mode 100644
index 0000000..744f46c
--- /dev/null
+++ b/css/css.go
@@ -0,0 +1,51 @@
+package css
+
+import (
+	"net/url"
+	"regexp"
+	"strings"
+
+	"github.com/cornelk/gotokit/log"
+	"github.com/gorilla/css/scanner"
+)
+
+var cssURLRe = regexp.MustCompile(`^url\(['"]?(.*?)['"]?\)$`)
+
+type Token = scanner.Token
+
+type urlProcessor func(token *Token, data string, url *url.URL)
+
+// Process the CSS data and call a processor for every found URL.
+func Process(logger *log.Logger, url *url.URL, data string, processor urlProcessor) {
+	css := scanner.New(data)
+
+	for {
+		token := css.Next()
+		if token.Type == scanner.TokenEOF || token.Type == scanner.TokenError {
+			break
+		}
+		if token.Type != scanner.TokenURI {
+			continue
+		}
+
+		match := cssURLRe.FindStringSubmatch(token.Value)
+		if match == nil {
+			continue
+		}
+
+		src := match[1]
+		if strings.HasPrefix(strings.ToLower(src), "data:") {
+			continue // skip embedded data
+		}
+
+		u, err := url.Parse(src)
+		if err != nil {
+			logger.Error("Parsing URL failed",
+				log.String("url", src),
+				log.Err(err))
+			continue
+		}
+		u = url.ResolveReference(u)
+		processor(token, src, u)
+	}
+}
diff --git a/htmlindex/attributes.go b/htmlindex/attributes.go
index 36721ce..16e2277 100644
--- a/htmlindex/attributes.go
+++ b/htmlindex/attributes.go
@@ -1,13 +1,29 @@
 package htmlindex
 
+import (
+	"net/url"
+
+	"github.com/cornelk/gotokit/log"
+	"golang.org/x/net/html"
+)
+
+type nodeAttributeParserData struct {
+	logger    *log.Logger
+	url       *url.URL
+	node      *html.Node
+	attribute string
+	value     string
+}
+
 // nodeAttributeParser returns the URL values of the attribute of the node and
 // whether the attribute has been processed.
-type nodeAttributeParser func(attribute, value string) ([]string, bool)
+type nodeAttributeParser func(data nodeAttributeParserData) ([]string, bool)
 
 type Node struct {
 	Attributes []string
 
-	parser nodeAttributeParser
+	noChildParsing bool
+	parser         nodeAttributeParser
 }
 
 const (
@@ -27,6 +43,7 @@ const (
 	ImgTag    = "img"
 	LinkTag   = "link"
 	ScriptTag = "script"
+	StyleTag  = "style"
 )
 
 // Nodes describes the HTML tags and their attributes that can contain URL.
@@ -47,6 +64,10 @@ var Nodes = map[string]Node{
 	ScriptTag: {
 		Attributes: []string{SrcAttribute},
 	},
+	StyleTag: {
+		noChildParsing: true,
+		parser:         styleParser,
+	},
 }
 
 // SrcSetAttributes contains the attributes that contain srcset values.
diff --git a/htmlindex/htmlindex.go b/htmlindex/htmlindex.go
index aaf15a1..19e7033 100644
--- a/htmlindex/htmlindex.go
+++ b/htmlindex/htmlindex.go
@@ -6,55 +6,64 @@ import (
 	"sort"
 	"strings"
 
+	"github.com/cornelk/goscrape/css"
+	"github.com/cornelk/gotokit/log"
 	"golang.org/x/net/html"
 )
 
 // Index provides an index for all HTML tags of relevance for scraping.
type Index struct { + logger *log.Logger + // key is HTML tag, value is a map of all its urls and the HTML nodes for it data map[string]map[string][]*html.Node } // New returns a new index. -func New() *Index { +func New(logger *log.Logger) *Index { return &Index{ - data: make(map[string]map[string][]*html.Node), + logger: logger, + data: make(map[string]map[string][]*html.Node), } } // Index the given HTML document. -func (h *Index) Index(baseURL *url.URL, node *html.Node) { +func (idx *Index) Index(baseURL *url.URL, node *html.Node) { for child := node.FirstChild; child != nil; child = child.NextSibling { - if child.Type != html.ElementNode { - continue + switch child.Type { + case html.ElementNode: + idx.indexElementNode(baseURL, node, child) + default: } + } +} - var references []string +func (idx *Index) indexElementNode(baseURL *url.URL, node, child *html.Node) { + var references []string - info, ok := Nodes[child.Data] - if ok { - references = nodeAttributeURLs(baseURL, child, info.parser, info.Attributes...) - } + info, ok := Nodes[child.Data] + if ok { + references = idx.nodeAttributeURLs(baseURL, child, info.parser, info.Attributes...) + } - m, ok := h.data[child.Data] - if !ok { - m = map[string][]*html.Node{} - h.data[child.Data] = m - } + m, ok := idx.data[child.Data] + if !ok { + m = map[string][]*html.Node{} + idx.data[child.Data] = m + } - for _, reference := range references { - m[reference] = append(m[reference], child) - } + for _, reference := range references { + m[reference] = append(m[reference], child) + } - if node.FirstChild != nil { - h.Index(baseURL, child) - } + if node.FirstChild != nil && !info.noChildParsing { + idx.Index(baseURL, child) } } // URLs returns all URLs of the references found for a specific tag. -func (h *Index) URLs(tag string) ([]*url.URL, error) { - m, ok := h.data[tag] +func (idx *Index) URLs(tag string) ([]*url.URL, error) { + m, ok := idx.data[tag] if !ok { return nil, nil } @@ -78,8 +87,8 @@ func (h *Index) URLs(tag string) ([]*url.URL, error) { } // Nodes returns a map of all URLs and their HTML nodes. -func (h *Index) Nodes(tag string) map[string][]*html.Node { - m, ok := h.data[tag] +func (idx *Index) Nodes(tag string) map[string][]*html.Node { + m, ok := idx.data[tag] if ok { return m } @@ -87,11 +96,23 @@ func (h *Index) Nodes(tag string) map[string][]*html.Node { } // nodeAttributeURLs returns resolved URLs based on the base URL and the HTML node attribute values. 
-func nodeAttributeURLs(baseURL *url.URL, node *html.Node, +func (idx *Index) nodeAttributeURLs(baseURL *url.URL, node *html.Node, parser nodeAttributeParser, attributeName ...string) []string { var results []string + processReferences := func(references []string) { + for _, reference := range references { + ur, err := url.Parse(reference) + if err != nil { + continue + } + + ur = baseURL.ResolveReference(ur) + results = append(results, ur.String()) + } + } + for _, attr := range node.Attr { var process bool for _, name := range attributeName { @@ -108,34 +129,44 @@ func nodeAttributeURLs(baseURL *url.URL, node *html.Node, var parserHandled bool if parser != nil { - references, parserHandled = parser(attr.Key, strings.TrimSpace(attr.Val)) + data := nodeAttributeParserData{ + logger: idx.logger, + url: baseURL, + node: node, + attribute: attr.Key, + value: strings.TrimSpace(attr.Val), + } + references, parserHandled = parser(data) } if parser == nil || !parserHandled { references = append(references, strings.TrimSpace(attr.Val)) } - for _, reference := range references { - ur, err := url.Parse(reference) - if err != nil { - continue - } + processReferences(references) + } - ur = baseURL.ResolveReference(ur) - results = append(results, ur.String()) + // special case to support style tag + if len(attributeName) == 0 && parser != nil { + data := nodeAttributeParserData{ + logger: idx.logger, + url: baseURL, + node: node, } + references, _ := parser(data) + processReferences(references) } return results } // srcSetValueSplitter returns the URL values of the srcset attribute of img nodes. -func srcSetValueSplitter(attribute, attributeValue string) ([]string, bool) { - if _, isSrcSet := SrcSetAttributes[attribute]; !isSrcSet { +func srcSetValueSplitter(data nodeAttributeParserData) ([]string, bool) { + if _, isSrcSet := SrcSetAttributes[data.attribute]; !isSrcSet { return nil, false } // split the set of responsive images - values := strings.Split(attributeValue, ",") + values := strings.Split(data.value, ",") for i, value := range values { value = strings.TrimSpace(value) @@ -145,3 +176,20 @@ func srcSetValueSplitter(attribute, attributeValue string) ([]string, bool) { return values, true } + +// styleParser returns the URL values of a CSS style tag. 
+func styleParser(data nodeAttributeParserData) ([]string, bool) {
+	if data.node.FirstChild == nil {
+		return nil, false
+	}
+
+	var urls []string
+	processor := func(_ *css.Token, _ string, url *url.URL) {
+		urls = append(urls, url.String())
+	}
+
+	cssData := data.node.FirstChild.Data
+	css.Process(data.logger, data.url, cssData, processor)
+
+	return urls, true
+}
diff --git a/htmlindex/htmlindex_test.go b/htmlindex/htmlindex_test.go
index 9324509..33fef19 100644
--- a/htmlindex/htmlindex_test.go
+++ b/htmlindex/htmlindex_test.go
@@ -5,6 +5,7 @@ import (
 	"net/url"
 	"testing"
 
+	"github.com/cornelk/gotokit/log"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 	"golang.org/x/net/html"
@@ -91,7 +92,8 @@ func testSetup(t *testing.T, input []byte) *Index {
 	ur, err := url.Parse("https://domain.com/")
 	require.NoError(t, err)
 
-	idx := New()
+	logger := log.NewTestLogger(t)
+	idx := New(logger)
 	idx.Index(ur, doc)
 
 	return idx
diff --git a/scraper/css.go b/scraper/css.go
deleted file mode 100644
index 8aeee46..0000000
--- a/scraper/css.go
+++ /dev/null
@@ -1,71 +0,0 @@
-package scraper
-
-import (
-	"bytes"
-	"fmt"
-	"net/url"
-	"path"
-	"regexp"
-	"strings"
-
-	"github.com/cornelk/gotokit/log"
-	"github.com/gorilla/css/scanner"
-)
-
-var cssURLRe = regexp.MustCompile(`^url\(['"]?(.*?)['"]?\)$`)
-
-func (s *Scraper) checkCSSForUrls(url *url.URL, buf *bytes.Buffer) *bytes.Buffer {
-	urls := make(map[string]string)
-	str := buf.String()
-	css := scanner.New(str)
-
-	for {
-		token := css.Next()
-		if token.Type == scanner.TokenEOF || token.Type == scanner.TokenError {
-			break
-		}
-		if token.Type != scanner.TokenURI {
-			continue
-		}
-
-		match := cssURLRe.FindStringSubmatch(token.Value)
-		if match == nil {
-			continue
-		}
-
-		src := match[1]
-		if strings.HasPrefix(strings.ToLower(src), "data:") {
-			continue // skip embedded data
-		}
-
-		u, err := url.Parse(src)
-		if err != nil {
-			s.logger.Error("Parsing URL failed",
-				log.String("url", src),
-				log.Err(err))
-			continue
-		}
-		u = url.ResolveReference(u)
-
-		s.imagesQueue = append(s.imagesQueue, u)
-
-		cssPath := *url
-		cssPath.Path = path.Dir(cssPath.Path) + "/"
-		resolved := resolveURL(&cssPath, src, s.URL.Host, false, "")
-		urls[token.Value] = resolved
-	}
-
-	if len(urls) == 0 {
-		return buf
-	}
-
-	for ori, filePath := range urls {
-		fixed := fmt.Sprintf("url(%s)", filePath)
-		str = strings.ReplaceAll(str, ori, fixed)
-		s.logger.Debug("CSS Element relinked",
-			log.String("url", ori),
-			log.String("fixed_url", fixed))
-	}
-
-	return bytes.NewBufferString(str)
-}
diff --git a/scraper/download.go b/scraper/download.go
index f8011e7..28d946f 100644
--- a/scraper/download.go
+++ b/scraper/download.go
@@ -6,7 +6,10 @@ import (
 	"errors"
 	"fmt"
 	"net/url"
+	"path"
+	"strings"
 
+	"github.com/cornelk/goscrape/css"
 	"github.com/cornelk/goscrape/htmlindex"
 	"github.com/cornelk/gotokit/log"
 )
@@ -19,6 +22,7 @@ var tagsWithReferences = []string{
 	htmlindex.LinkTag,
 	htmlindex.ScriptTag,
 	htmlindex.BodyTag,
+	htmlindex.StyleTag,
 }
 
 func (s *Scraper) downloadReferences(ctx context.Context, index *htmlindex.Index) error {
@@ -44,7 +48,7 @@
 
 	var processor assetProcessor
 	if tag == htmlindex.LinkTag {
-		processor = s.checkCSSForUrls
+		processor = s.cssProcessor
 	}
 	for _, ur := range references {
 		if err := s.downloadAsset(ctx, ur, processor); err != nil && errors.Is(err, context.Canceled) {
@@ -98,3 +102,34 @@ func (s *Scraper) downloadAsset(ctx context.Context, u *url.URL, processor asset
 	return nil
 }
+
+func (s *Scraper) cssProcessor(baseURL *url.URL, buf *bytes.Buffer) *bytes.Buffer {
+	urls := make(map[string]string)
+
+	processor := func(token *css.Token, data string, u *url.URL) {
+		s.imagesQueue = append(s.imagesQueue, u)
+
+		cssPath := *baseURL
+		cssPath.Path = path.Dir(cssPath.Path) + "/"
+		resolved := resolveURL(&cssPath, data, s.URL.Host, false, "")
+		urls[token.Value] = resolved
+	}
+
+	data := buf.String()
+	css.Process(s.logger, baseURL, data, processor)
+
+	if len(urls) == 0 {
+		return buf
+	}
+
+	str := buf.String()
+	for ori, filePath := range urls {
+		fixed := fmt.Sprintf("url(%s)", filePath)
+		str = strings.ReplaceAll(str, ori, fixed)
+		s.logger.Debug("CSS Element relinked",
+			log.String("url", ori),
+			log.String("fixed_url", fixed))
+	}
+
+	return bytes.NewBufferString(str)
+}
diff --git a/scraper/css_test.go b/scraper/download_test.go
similarity index 94%
rename from scraper/css_test.go
rename to scraper/download_test.go
index f8cb777..0a625ea 100644
--- a/scraper/css_test.go
+++ b/scraper/download_test.go
@@ -10,7 +10,7 @@ import (
 	"github.com/stretchr/testify/require"
 )
 
-func TestCheckCSSForURLs(t *testing.T) {
+func TestCSSProcessor(t *testing.T) {
 	logger := log.NewTestLogger(t)
 	cfg := Config{
 		URL: "http://localhost",
@@ -33,7 +33,7 @@ func TestCheckCSSForURLs(t *testing.T) {
 	for input, expected := range fixtures {
 		s.imagesQueue = nil
 		buf := bytes.NewBufferString(input)
-		s.checkCSSForUrls(u, buf)
+		s.cssProcessor(u, buf)
 
 		if expected == "" {
 			assert.Empty(t, s.imagesQueue)
diff --git a/scraper/html_test.go b/scraper/html_test.go
index b85fe91..0261f8a 100644
--- a/scraper/html_test.go
+++ b/scraper/html_test.go
@@ -33,7 +33,7 @@ func TestFixURLReferences(t *testing.T) {
 	doc, err := html.Parse(buf)
 	require.NoError(t, err)
 
-	index := htmlindex.New()
+	index := htmlindex.New(logger)
 	index.Index(s.URL, doc)
 
 	ref, fixed, err := s.fixURLReferences(s.URL, doc, index)
diff --git a/scraper/scraper.go b/scraper/scraper.go
index 113629f..ce0a115 100644
--- a/scraper/scraper.go
+++ b/scraper/scraper.go
@@ -218,7 +218,7 @@ func (s *Scraper) processURL(ctx context.Context, u *url.URL, currentDepth uint)
 		return fmt.Errorf("parsing HTML: %w", err)
 	}
 
-	index := htmlindex.New()
+	index := htmlindex.New(s.logger)
 	index.Index(u, doc)
 
 	s.storeDownload(u, buf, doc, index, fileExtension)
diff --git a/scraper/scraper_test.go b/scraper/scraper_test.go
index 4246d63..7d418f7 100644
--- a/scraper/scraper_test.go
+++ b/scraper/scraper_test.go
@@ -130,3 +130,39 @@ func TestScraperAttributes(t *testing.T) {
 	}
 	assert.Equal(t, expectedProcessed, scraper.processed)
 }
+
+func TestScraperInternalCss(t *testing.T) {
+	indexPage := []byte(`
+<html>
+<head>
+<style>
+body {
+	background-image: url("background.jpg");
+}
+</style>
+</head>
+<body></body>
+</html>
+`)
+	empty := []byte(``)
+
+	startURL := "https://example.org/"
+	urls := map[string][]byte{
+		"https://example.org/":               indexPage,
+		"https://example.org/background.jpg": empty,
+	}
+
+	scraper := newTestScraper(t, startURL, urls)
+	require.NotNil(t, scraper)
+
+	ctx := context.Background()
+	err := scraper.Start(ctx)
+	require.NoError(t, err)
+
+	expectedProcessed := map[string]struct{}{
+		"/":               {},
+		"/background.jpg": {},
+	}
+	assert.Equal(t, expectedProcessed, scraper.processed)
+}
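
For reference, the new `css` package can be exercised on its own. The sketch below is written as a Go test so it can reuse the `log.NewTestLogger` helper that the patch itself uses in `htmlindex_test.go`; the test name, the sample stylesheet, and the URLs are illustrative assumptions rather than part of the change:

```go
package css_test

import (
	"net/url"
	"testing"

	"github.com/cornelk/goscrape/css"
	"github.com/cornelk/gotokit/log"
)

// Collect every resolved url() reference found in a stylesheet.
func TestProcessSketch(t *testing.T) {
	logger := log.NewTestLogger(t)

	base, err := url.Parse("https://example.org/assets/site.css")
	if err != nil {
		t.Fatal(err)
	}

	stylesheet := `
body { background: url("../img/bg.png"); }
h1 { background: url(data:image/png;base64,AAAA); }`

	var found []string
	css.Process(logger, base, stylesheet, func(_ *css.Token, _ string, u *url.URL) {
		// u is the url(...) value already resolved against base
		found = append(found, u.String())
	})

	if len(found) != 1 || found[0] != "https://example.org/img/bg.png" {
		t.Fatalf("unexpected URLs: %v", found)
	}
}
```

Relative references come back resolved against the stylesheet's base URL and `data:` URIs are skipped, which is the behavior that both `cssProcessor` and `styleParser` above build on.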
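
In the same spirit, a hypothetical test showing how the new `StyleTag` entry exposes inline `<style>` blocks through the index's exported API; `noChildParsing` stops the indexer from descending into the style element's text node, and `styleParser` extracts the URLs from it instead:

```go
package htmlindex_test

import (
	"net/url"
	"strings"
	"testing"

	"github.com/cornelk/goscrape/htmlindex"
	"github.com/cornelk/gotokit/log"
	"golang.org/x/net/html"
)

func TestStyleTagIndexSketch(t *testing.T) {
	page := `<html><head><style>
body { background: url("bg.png"); }
</style></head><body></body></html>`

	doc, err := html.Parse(strings.NewReader(page))
	if err != nil {
		t.Fatal(err)
	}

	base, err := url.Parse("https://example.org/")
	if err != nil {
		t.Fatal(err)
	}

	idx := htmlindex.New(log.NewTestLogger(t))
	idx.Index(base, doc)

	// the style tag's url() references are now regular index entries
	urls, err := idx.URLs(htmlindex.StyleTag)
	if err != nil {
		t.Fatal(err)
	}
	if len(urls) != 1 || urls[0].String() != "https://example.org/bg.png" {
		t.Fatalf("unexpected URLs: %v", urls)
	}
}
```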