diff --git a/htmlindex/attributes.go b/htmlindex/attributes.go
index b1bb08c..c760dfd 100644
--- a/htmlindex/attributes.go
+++ b/htmlindex/attributes.go
@@ -21,21 +21,29 @@ const (
SrcSetAttribute = "srcset"
)
+const (
+ ATag = "a"
+ BodyTag = "body"
+ ImgTag = "img"
+ LinkTag = "link"
+ ScriptTag = "script"
+)
+
var Nodes = map[string]Node{
- "a": {
+ ATag: {
Attributes: []string{HrefAttribute},
},
- "body": {
+ BodyTag: {
Attributes: []string{BackgroundAttribute},
},
- "img": {
+ ImgTag: {
Attributes: []string{SrcAttribute, DataSrcAttribute, SrcSetAttribute, DataSrcSetAttribute},
parser: srcSetValueSplitter,
},
- "link": {
+ LinkTag: {
Attributes: []string{HrefAttribute},
},
- "script": {
+ ScriptTag: {
Attributes: []string{SrcAttribute},
},
}
diff --git a/htmlindex/htmlindex_test.go b/htmlindex/htmlindex_test.go
index edd2ed5..9324509 100644
--- a/htmlindex/htmlindex_test.go
+++ b/htmlindex/htmlindex_test.go
@@ -65,14 +65,14 @@ func TestIndexImg(t *testing.T) {
`)
idx := testSetup(t, input)
- references, err := idx.URLs("img")
+ references, err := idx.URLs(ImgTag)
require.NoError(t, err)
require.Len(t, references, 3)
assert.Equal(t, "https://domain.com/test-480w.jpg", references[0].String())
assert.Equal(t, "https://domain.com/test-800w.jpg", references[1].String())
assert.Equal(t, "https://domain.com/test.jpg", references[2].String())
- references, err = idx.URLs("body")
+ references, err = idx.URLs(BodyTag)
require.NoError(t, err)
require.Len(t, references, 1)
assert.Equal(t, "https://domain.com/bg.jpg", references[0].String())
diff --git a/scraper/download.go b/scraper/download.go
index b7b4311..4c1b487 100644
--- a/scraper/download.go
+++ b/scraper/download.go
@@ -16,15 +16,21 @@ import (
type assetProcessor func(URL *url.URL, buf *bytes.Buffer) *bytes.Buffer
var tagsWithReferences = []string{
- "link",
- "script",
- "body",
+ htmlindex.LinkTag,
+ htmlindex.ScriptTag,
+ htmlindex.BodyTag,
}
func (s *Scraper) downloadReferences(ctx context.Context, index *htmlindex.Index) error {
- references, err := index.URLs("img")
+ references, err := index.URLs(htmlindex.BodyTag)
if err != nil {
- s.logger.Error("Getting img nodes URLs failed", log.Err(err))
+ s.logger.Error("Getting body node URLs failed", log.Err(err))
+ }
+ s.imagesQueue = append(s.imagesQueue, references...)
+
+ references, err = index.URLs(htmlindex.ImgTag)
+ if err != nil {
+ s.logger.Error("Getting img node URLs failed", log.Err(err))
}
s.imagesQueue = append(s.imagesQueue, references...)
@@ -37,7 +43,7 @@ func (s *Scraper) downloadReferences(ctx context.Context, index *htmlindex.Index
}
var processor assetProcessor
- if tag == "link" {
+ if tag == htmlindex.LinkTag {
processor = s.checkCSSForUrls
}
for _, ur := range references {
diff --git a/scraper/html.go b/scraper/html.go
index 8be57f9..94b6091 100644
--- a/scraper/html.go
+++ b/scraper/html.go
@@ -38,11 +38,11 @@ func (s *Scraper) fixURLReferences(url *url.URL, doc *html.Node,
}
var nodeAttributes = map[string]string{
- "a": htmlindex.HrefAttribute,
- "img": htmlindex.SrcAttribute,
- "link": htmlindex.HrefAttribute,
- "script": htmlindex.SrcAttribute,
- "body": htmlindex.BackgroundAttribute,
+ htmlindex.ATag: htmlindex.HrefAttribute,
+ htmlindex.BodyTag: htmlindex.BackgroundAttribute,
+ htmlindex.ImgTag: htmlindex.SrcAttribute,
+ htmlindex.LinkTag: htmlindex.HrefAttribute,
+ htmlindex.ScriptTag: htmlindex.SrcAttribute,
}
// fixHTMLNodeURLs processes all HTML nodes that contain URLs that need to be fixed
@@ -51,7 +51,7 @@ func (s *Scraper) fixHTMLNodeURLs(baseURL *url.URL, relativeToRoot string, index
var changed bool
for tag, attribute := range nodeAttributes {
- isHyperlink := tag == "a"
+ isHyperlink := tag == htmlindex.ATag
urls := index.Nodes(tag)
for _, nodes := range urls {
diff --git a/scraper/scraper.go b/scraper/scraper.go
index 501a78d..113629f 100644
--- a/scraper/scraper.go
+++ b/scraper/scraper.go
@@ -230,7 +230,7 @@ func (s *Scraper) processURL(ctx context.Context, u *url.URL, currentDepth uint)
// check first and download afterward to not hit max depth limit for
// start page links because of recursive linking
// a hrefs
- references, err := index.URLs("a")
+ references, err := index.URLs(htmlindex.ATag)
if err != nil {
s.logger.Error("Parsing URL failed", log.Err(err))
}