Skip to content

Commit

Permalink
use consts to reference html tags
Browse files Browse the repository at this point in the history
  • Loading branch information
cornelk committed Jul 13, 2024
1 parent 674bc24 commit 3b38968
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 20 deletions.
18 changes: 13 additions & 5 deletions htmlindex/attributes.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,21 +21,29 @@ const (
SrcSetAttribute = "srcset"
)

// Names of the HTML tags whose attributes are indexed for URL references.
const (
ATag = "a"
BodyTag = "body"
ImgTag = "img"
LinkTag = "link"
ScriptTag = "script"
)

var Nodes = map[string]Node{
"a": {
ATag: {
Attributes: []string{HrefAttribute},
},
"body": {
BodyTag: {
Attributes: []string{BackgroundAttribute},
},
"img": {
ImgTag: {
Attributes: []string{SrcAttribute, DataSrcAttribute, SrcSetAttribute, DataSrcSetAttribute},
parser: srcSetValueSplitter,
},
"link": {
LinkTag: {
Attributes: []string{HrefAttribute},
},
"script": {
ScriptTag: {
Attributes: []string{SrcAttribute},
},
}
4 changes: 2 additions & 2 deletions htmlindex/htmlindex_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,14 +65,14 @@ func TestIndexImg(t *testing.T) {
`)

idx := testSetup(t, input)
references, err := idx.URLs("img")
references, err := idx.URLs(ImgTag)
require.NoError(t, err)
require.Len(t, references, 3)
assert.Equal(t, "https://domain.com/test-480w.jpg", references[0].String())
assert.Equal(t, "https://domain.com/test-800w.jpg", references[1].String())
assert.Equal(t, "https://domain.com/test.jpg", references[2].String())

references, err = idx.URLs("body")
references, err = idx.URLs(BodyTag)
require.NoError(t, err)
require.Len(t, references, 1)
assert.Equal(t, "https://domain.com/bg.jpg", references[0].String())
Expand Down
18 changes: 12 additions & 6 deletions scraper/download.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,21 @@ import (
type assetProcessor func(URL *url.URL, buf *bytes.Buffer) *bytes.Buffer

// tagsWithReferences lists the HTML tags whose referenced assets are
// downloaded generically in downloadReferences (a and img tags are
// handled separately).
// NOTE(review): the diff rendering had left both the removed string
// literals and the added htmlindex consts in this literal; only the
// post-commit const entries are kept here.
var tagsWithReferences = []string{
	htmlindex.LinkTag,
	htmlindex.ScriptTag,
	htmlindex.BodyTag,
}

func (s *Scraper) downloadReferences(ctx context.Context, index *htmlindex.Index) error {
references, err := index.URLs("img")
references, err := index.URLs(htmlindex.BodyTag)
if err != nil {
s.logger.Error("Getting img nodes URLs failed", log.Err(err))
s.logger.Error("Getting body node URLs failed", log.Err(err))
}
s.imagesQueue = append(s.imagesQueue, references...)

references, err = index.URLs("img")
if err != nil {
s.logger.Error("Getting img node URLs failed", log.Err(err))
}
s.imagesQueue = append(s.imagesQueue, references...)

Expand All @@ -37,7 +43,7 @@ func (s *Scraper) downloadReferences(ctx context.Context, index *htmlindex.Index
}

var processor assetProcessor
if tag == "link" {
if tag == htmlindex.LinkTag {
processor = s.checkCSSForUrls
}
for _, ur := range references {
Expand Down
12 changes: 6 additions & 6 deletions scraper/html.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,11 @@ func (s *Scraper) fixURLReferences(url *url.URL, doc *html.Node,
}

// nodeAttributes maps each handled HTML tag to the attribute whose URL
// value gets rewritten when fixing references.
// NOTE(review): the diff rendering had left both the removed
// string-literal keys and the added htmlindex const keys in this
// literal (duplicate keys, invalid Go); only the post-commit
// const-keyed entries are kept here.
var nodeAttributes = map[string]string{
	htmlindex.ATag:      htmlindex.HrefAttribute,
	htmlindex.BodyTag:   htmlindex.BackgroundAttribute,
	htmlindex.ImgTag:    htmlindex.SrcAttribute,
	htmlindex.LinkTag:   htmlindex.HrefAttribute,
	htmlindex.ScriptTag: htmlindex.SrcAttribute,
}

// fixHTMLNodeURLs processes all HTML nodes that contain URLs that need to be fixed
Expand All @@ -51,7 +51,7 @@ func (s *Scraper) fixHTMLNodeURLs(baseURL *url.URL, relativeToRoot string, index
var changed bool

for tag, attribute := range nodeAttributes {
isHyperlink := tag == "a"
isHyperlink := tag == htmlindex.ATag

urls := index.Nodes(tag)
for _, nodes := range urls {
Expand Down
2 changes: 1 addition & 1 deletion scraper/scraper.go
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ func (s *Scraper) processURL(ctx context.Context, u *url.URL, currentDepth uint)
// check first and download afterward to not hit max depth limit for
// start page links because of recursive linking
// a hrefs
references, err := index.URLs("a")
references, err := index.URLs(htmlindex.ATag)
if err != nil {
s.logger.Error("Parsing URL failed", log.Err(err))
}
Expand Down

0 comments on commit 3b38968

Please sign in to comment.