Skip to content

Commit

Permalink
use consts to reference html tags
Browse files Browse the repository at this point in the history
  • Loading branch information
cornelk committed Jul 13, 2024
1 parent 674bc24 commit 3b38968
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 20 deletions.
18 changes: 13 additions & 5 deletions htmlindex/attributes.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,21 +21,29 @@ const (
SrcSetAttribute = "srcset"
)

// Names of the HTML tags whose attributes are indexed for URL references.
const (
ATag = "a"
BodyTag = "body"
ImgTag = "img"
LinkTag = "link"
ScriptTag = "script"
)

var Nodes = map[string]Node{
"a": {
ATag: {
Attributes: []string{HrefAttribute},
},
"body": {
BodyTag: {
Attributes: []string{BackgroundAttribute},
},
"img": {
ImgTag: {
Attributes: []string{SrcAttribute, DataSrcAttribute, SrcSetAttribute, DataSrcSetAttribute},
parser: srcSetValueSplitter,
},
"link": {
LinkTag: {
Attributes: []string{HrefAttribute},
},
"script": {
ScriptTag: {
Attributes: []string{SrcAttribute},
},
}
4 changes: 2 additions & 2 deletions htmlindex/htmlindex_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,14 +65,14 @@ func TestIndexImg(t *testing.T) {
`)

idx := testSetup(t, input)
references, err := idx.URLs("img")
references, err := idx.URLs(ImgTag)
require.NoError(t, err)
require.Len(t, references, 3)
assert.Equal(t, "https://domain.com/test-480w.jpg", references[0].String())
assert.Equal(t, "https://domain.com/test-800w.jpg", references[1].String())
assert.Equal(t, "https://domain.com/test.jpg", references[2].String())

references, err = idx.URLs("body")
references, err = idx.URLs(BodyTag)
require.NoError(t, err)
require.Len(t, references, 1)
assert.Equal(t, "https://domain.com/bg.jpg", references[0].String())
Expand Down
18 changes: 12 additions & 6 deletions scraper/download.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,21 @@ import (
type assetProcessor func(URL *url.URL, buf *bytes.Buffer) *bytes.Buffer

// tagsWithReferences lists the HTML tags whose referenced assets are
// downloaded generically in downloadReferences (a and img tags are
// handled separately).
// NOTE(review): the diff rendering had left both the removed string
// literals and the added htmlindex consts in this literal; only the
// post-commit const entries are kept here.
var tagsWithReferences = []string{
	htmlindex.LinkTag,
	htmlindex.ScriptTag,
	htmlindex.BodyTag,
}

func (s *Scraper) downloadReferences(ctx context.Context, index *htmlindex.Index) error {
references, err := index.URLs("img")
references, err := index.URLs(htmlindex.BodyTag)
if err != nil {
s.logger.Error("Getting img nodes URLs failed", log.Err(err))
s.logger.Error("Getting body node URLs failed", log.Err(err))
}
s.imagesQueue = append(s.imagesQueue, references...)

references, err = index.URLs("img")
if err != nil {
s.logger.Error("Getting img node URLs failed", log.Err(err))
}
s.imagesQueue = append(s.imagesQueue, references...)

Expand All @@ -37,7 +43,7 @@ func (s *Scraper) downloadReferences(ctx context.Context, index *htmlindex.Index
}

var processor assetProcessor
if tag == "link" {
if tag == htmlindex.LinkTag {
processor = s.checkCSSForUrls
}
for _, ur := range references {
Expand Down
12 changes: 6 additions & 6 deletions scraper/html.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,11 @@ func (s *Scraper) fixURLReferences(url *url.URL, doc *html.Node,
}

// nodeAttributes maps each handled HTML tag to the attribute whose URL
// value gets rewritten when fixing references.
// NOTE(review): the diff rendering had left both the removed
// string-literal keys and the added htmlindex const keys in this
// literal (duplicate keys, invalid Go); only the post-commit
// const-keyed entries are kept here.
var nodeAttributes = map[string]string{
	htmlindex.ATag:      htmlindex.HrefAttribute,
	htmlindex.BodyTag:   htmlindex.BackgroundAttribute,
	htmlindex.ImgTag:    htmlindex.SrcAttribute,
	htmlindex.LinkTag:   htmlindex.HrefAttribute,
	htmlindex.ScriptTag: htmlindex.SrcAttribute,
}

// fixHTMLNodeURLs processes all HTML nodes that contain URLs that need to be fixed
Expand All @@ -51,7 +51,7 @@ func (s *Scraper) fixHTMLNodeURLs(baseURL *url.URL, relativeToRoot string, index
var changed bool

for tag, attribute := range nodeAttributes {
isHyperlink := tag == "a"
isHyperlink := tag == htmlindex.ATag

urls := index.Nodes(tag)
for _, nodes := range urls {
Expand Down
2 changes: 1 addition & 1 deletion scraper/scraper.go
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ func (s *Scraper) processURL(ctx context.Context, u *url.URL, currentDepth uint)
// check first and download afterward to not hit max depth limit for
// start page links because of recursive linking
// a hrefs
references, err := index.URLs("a")
references, err := index.URLs(htmlindex.ATag)
if err != nil {
s.logger.Error("Parsing URL failed", log.Err(err))
}
Expand Down

0 comments on commit 3b38968

Please sign in to comment.