diff --git a/htmlindex/attributes.go b/htmlindex/attributes.go
index b1bb08c..c760dfd 100644
--- a/htmlindex/attributes.go
+++ b/htmlindex/attributes.go
@@ -21,21 +21,29 @@ const (
 	SrcSetAttribute = "srcset"
 )
 
+const (
+	ATag      = "a"
+	BodyTag   = "body"
+	ImgTag    = "img"
+	LinkTag   = "link"
+	ScriptTag = "script"
+)
+
 var Nodes = map[string]Node{
-	"a": {
+	ATag: {
 		Attributes: []string{HrefAttribute},
 	},
-	"body": {
+	BodyTag: {
 		Attributes: []string{BackgroundAttribute},
 	},
-	"img": {
+	ImgTag: {
 		Attributes: []string{SrcAttribute, DataSrcAttribute, SrcSetAttribute, DataSrcSetAttribute},
 		parser:     srcSetValueSplitter,
 	},
-	"link": {
+	LinkTag: {
 		Attributes: []string{HrefAttribute},
 	},
-	"script": {
+	ScriptTag: {
 		Attributes: []string{SrcAttribute},
 	},
 }
diff --git a/htmlindex/htmlindex_test.go b/htmlindex/htmlindex_test.go
index edd2ed5..9324509 100644
--- a/htmlindex/htmlindex_test.go
+++ b/htmlindex/htmlindex_test.go
@@ -65,14 +65,14 @@ func TestIndexImg(t *testing.T) {
 `)
 	idx := testSetup(t, input)
 
-	references, err := idx.URLs("img")
+	references, err := idx.URLs(ImgTag)
 	require.NoError(t, err)
 	require.Len(t, references, 3)
 	assert.Equal(t, "https://domain.com/test-480w.jpg", references[0].String())
 	assert.Equal(t, "https://domain.com/test-800w.jpg", references[1].String())
 	assert.Equal(t, "https://domain.com/test.jpg", references[2].String())
 
-	references, err = idx.URLs("body")
+	references, err = idx.URLs(BodyTag)
 	require.NoError(t, err)
 	require.Len(t, references, 1)
 	assert.Equal(t, "https://domain.com/bg.jpg", references[0].String())
diff --git a/scraper/download.go b/scraper/download.go
index b7b4311..4c1b487 100644
--- a/scraper/download.go
+++ b/scraper/download.go
@@ -16,15 +16,21 @@ import (
 
 type assetProcessor func(URL *url.URL, buf *bytes.Buffer) *bytes.Buffer
 
 var tagsWithReferences = []string{
-	"link",
-	"script",
-	"body",
+	htmlindex.LinkTag,
+	htmlindex.ScriptTag,
+	htmlindex.BodyTag,
 }
 
 func (s *Scraper) downloadReferences(ctx context.Context, index *htmlindex.Index) error {
-	references, err := index.URLs("img")
+	references, err := index.URLs(htmlindex.BodyTag)
 	if err != nil {
-		s.logger.Error("Getting img nodes URLs failed", log.Err(err))
+		s.logger.Error("Getting body node URLs failed", log.Err(err))
+	}
+	s.imagesQueue = append(s.imagesQueue, references...)
+
+	references, err = index.URLs(htmlindex.ImgTag)
+	if err != nil {
+		s.logger.Error("Getting img node URLs failed", log.Err(err))
 	}
 	s.imagesQueue = append(s.imagesQueue, references...)
@@ -37,7 +43,7 @@ func (s *Scraper) downloadReferences(ctx context.Context, index *htmlindex.Index
 		}
 
 		var processor assetProcessor
-		if tag == "link" {
+		if tag == htmlindex.LinkTag {
 			processor = s.checkCSSForUrls
 		}
 		for _, ur := range references {
diff --git a/scraper/html.go b/scraper/html.go
index 8be57f9..94b6091 100644
--- a/scraper/html.go
+++ b/scraper/html.go
@@ -38,11 +38,11 @@ func (s *Scraper) fixURLReferences(url *url.URL, doc *html.Node,
 }
 
 var nodeAttributes = map[string]string{
-	"a":      htmlindex.HrefAttribute,
-	"img":    htmlindex.SrcAttribute,
-	"link":   htmlindex.HrefAttribute,
-	"script": htmlindex.SrcAttribute,
-	"body":   htmlindex.BackgroundAttribute,
+	htmlindex.ATag:      htmlindex.HrefAttribute,
+	htmlindex.BodyTag:   htmlindex.BackgroundAttribute,
+	htmlindex.ImgTag:    htmlindex.SrcAttribute,
+	htmlindex.LinkTag:   htmlindex.HrefAttribute,
+	htmlindex.ScriptTag: htmlindex.SrcAttribute,
 }
 
 // fixHTMLNodeURLs processes all HTML nodes that contain URLs that need to be fixed
@@ -51,7 +51,7 @@ func (s *Scraper) fixHTMLNodeURLs(baseURL *url.URL, relativeToRoot string, index
 	var changed bool
 
 	for tag, attribute := range nodeAttributes {
-		isHyperlink := tag == "a"
+		isHyperlink := tag == htmlindex.ATag
 
 		urls := index.Nodes(tag)
 		for _, nodes := range urls {
diff --git a/scraper/scraper.go b/scraper/scraper.go
index 501a78d..113629f 100644
--- a/scraper/scraper.go
+++ b/scraper/scraper.go
@@ -230,7 +230,7 @@ func (s *Scraper) processURL(ctx context.Context, u *url.URL, currentDepth uint)
 	// check first and download afterward to not hit max depth limit for
 	// start page links because of recursive linking
 	// a hrefs
-	references, err := index.URLs("a")
+	references, err := index.URLs(htmlindex.ATag)
 	if err != nil {
 		s.logger.Error("Parsing URL failed", log.Err(err))
 	}
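For context, a minimal sketch (not part of the change) of how a caller outside the package can use the newly exported tag constants instead of magic strings. The import path github.com/cornelk/goscrape/htmlindex is an assumption inferred from the file paths in this diff; adjust it to the repository's actual module name.

package main

import (
	"fmt"

	// assumed module path, inferred from the htmlindex/ directory above
	"github.com/cornelk/goscrape/htmlindex"
)

func main() {
	// Look up the indexed attributes per tag via the exported constants,
	// mirroring how scraper/download.go iterates tagsWithReferences.
	for _, tag := range []string{htmlindex.ATag, htmlindex.ImgTag, htmlindex.LinkTag} {
		node := htmlindex.Nodes[tag]
		fmt.Printf("tag %q indexes attributes %v\n", tag, node.Attributes)
	}
}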