Skip to content

Commit

Permalink
Add EstimateUnixFSFileDefaultChunking for estimating how many bytes t…
Browse files Browse the repository at this point in the history
…he car file representing a unixfs file of a given size will take

Fix #58
  • Loading branch information
willscott committed Aug 6, 2023
1 parent 2450f69 commit fe18dab
Show file tree
Hide file tree
Showing 4 changed files with 159 additions and 4 deletions.
86 changes: 86 additions & 0 deletions data/builder/file.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
basicnode "github.com/ipld/go-ipld-prime/node/basic"
"github.com/multiformats/go-multicodec"
multihash "github.com/multiformats/go-multihash/core"
"github.com/multiformats/go-varint"

// raw needed for opening as bytes
_ "github.com/ipld/go-ipld-prime/codec/raw"
Expand Down Expand Up @@ -57,6 +58,91 @@ func BuildUnixFSFile(r io.Reader, chunker string, ls *ipld.LinkSystem) (ipld.Lin
}
}

// EstimateUnixFSFileDefaultChunking estimates the byte size of the car file
// that would be needed to hold a UnixFS file containing data of the given
// length, assuming the data is split into chunk.DefaultBlockSize leaves and
// grouped into intermediate nodes of at most DefaultLinksPerBlock links.
//
// The estimate is the sum of:
//   - the raw data itself,
//   - a CID plus a uvarint length prefix for every raw leaf block,
//   - the encoded dag-pb bytes, a CID and a uvarint length prefix for every
//     intermediate node, and
//   - a fixed-size car header.
//
// Intermediate nodes are actually built and encoded (into a throwaway
// in-memory store) so that their size reflects the real dag-pb encoding.
func EstimateUnixFSFileDefaultChunking(dataLength uint64) uint64 {
	// carHeaderSize is the fixed header overhead added to every estimate.
	// NOTE(review): presumably the size of a CARv1 header carrying a single
	// CIDv1 sha2-256 root, including its length prefix — confirm against
	// go-car's HeaderSize.
	const carHeaderSize = 59

	blkSize := uint64(chunk.DefaultBlockSize)
	blocks := dataLength / blkSize
	remainder := dataLength % blkSize

	size := dataLength

	// Only the encoded length of the CID is needed, so any input will do.
	// The error is ignored: hashing a fixed one-byte input with the
	// package's own leaf prototype is not expected to fail.
	cidExample, _ := leafLinkProto.Prefix.Sum([]byte{0})
	cidLength := uint64(len(cidExample.Bytes()))

	// links holds the data length covered by each node of the current layer,
	// starting with the raw leaves.
	links := make([]uint64, 0, blocks+1)
	for i := uint64(0); i < blocks; i++ {
		links = append(links, blkSize)
	}
	// account for the uvarint + cid length of each full block of raw data.
	size += blocks * (cidLength + uint64(varint.UvarintSize(cidLength+blkSize)))
	if remainder > 0 {
		links = append(links, remainder)
		size += cidLength + uint64(varint.UvarintSize(cidLength+remainder))
	}

	// account for the metadata overhead nodes.
	ls := cidlink.DefaultLinkSystem()
	storage := cidlink.Memory{}
	ls.StorageReadOpener = storage.OpenRead
	ls.StorageWriteOpener = storage.OpenWrite

	// Collapse the tree one layer at a time until a single root remains.
	for len(links) > 1 {
		nxtLnks := []uint64{}
		for len(links) > 1 {
			children := uint64(DefaultLinksPerBlock)
			if len(links) < DefaultLinksPerBlock {
				children = uint64(len(links))
			}
			childrenLinks := links[:children]
			links = links[children:]
			totalSize := uint64(0)
			for _, l := range childrenLinks {
				totalSize += l
			}

			// Errors from the builders below are ignored: the inputs are
			// generated here and this function has no error return.
			node, _ := BuildUnixFS(func(b *Builder) {
				FileSize(b, totalSize)
				BlockSizes(b, childrenLinks)
			})

			// Pack into the dagpb node.
			dpbb := dagpb.Type.PBNode.NewBuilder()
			pbm, _ := dpbb.BeginMap(2)
			pblb, _ := pbm.AssembleEntry("Links")
			pbl, _ := pblb.BeginList(int64(len(childrenLinks)))
			for _, c := range childrenLinks {
				// Every link reuses cidExample; only the encoded size of
				// the link matters for the estimate.
				pbln, _ := BuildUnixFSDirectoryEntry("", int64(c), cidlink.Link{Cid: cidExample})
				pbl.AssembleValue().AssignNode(pbln)
			}
			pbl.Finish()
			pbm.AssembleKey().AssignString("Data")
			pbm.AssembleValue().AssignBytes(data.EncodeUnixFSData(node))
			pbm.Finish()
			pbn := dpbb.Build()

			// Store and read back the node to obtain its encoded bytes.
			pbLnk := ls.MustStore(ipld.LinkContext{}, fileLinkProto, pbn)
			pbRcrd, _ := ls.LoadRaw(ipld.LinkContext{}, pbLnk)

			// dagpb overhead
			intermediateNodeSize := uint64(len(pbRcrd))

			size += intermediateNodeSize + cidLength + uint64(varint.UvarintSize(cidLength+intermediateNodeSize))
			nxtLnks = append(nxtLnks, totalSize)
		}
		// A single leftover link is carried up to the next layer unchanged.
		if len(links) == 1 {
			nxtLnks = append(nxtLnks, links[0])
		}
		links = nxtLnks
	}

	// add the car header
	size += carHeaderSize

	return size
}

var fileLinkProto = cidlink.LinkPrototype{
Prefix: cid.Prefix{
Version: 1,
Expand Down
72 changes: 69 additions & 3 deletions data/builder/file_test.go
Original file line number Diff line number Diff line change
@@ -1,16 +1,27 @@
package builder
package builder_test

import (
"bytes"
"context"
"fmt"
"io"
"math/rand"
"testing"

"github.com/ipfs/go-unixfsnode/data/builder"
"github.com/multiformats/go-multicodec"
multihash "github.com/multiformats/go-multihash/core"

"github.com/ipfs/go-cid"
u "github.com/ipfs/go-ipfs-util"
"github.com/ipfs/go-unixfsnode/file"
carv1 "github.com/ipld/go-car"
"github.com/ipld/go-car/v2"
dagpb "github.com/ipld/go-codec-dagpb"
"github.com/ipld/go-ipld-prime"
"github.com/ipld/go-ipld-prime/linking"
cidlink "github.com/ipld/go-ipld-prime/linking/cid"
selectorparse "github.com/ipld/go-ipld-prime/traversal/selector/parse"
)

func TestBuildUnixFSFile(t *testing.T) {
Expand All @@ -23,7 +34,7 @@ func TestBuildUnixFSFile(t *testing.T) {
ls.StorageReadOpener = storage.OpenRead
ls.StorageWriteOpener = storage.OpenWrite

f, _, err := BuildUnixFSFile(r, "", &ls)
f, _, err := builder.BuildUnixFSFile(r, "", &ls)
if err != nil {
t.Fatal(err)
}
Expand All @@ -43,6 +54,61 @@ func TestBuildUnixFSFile(t *testing.T) {
}
}

// TestEstimateUnixFSFileDefaultChunking builds real UnixFS files of
// increasing size (100 bytes up to 100,000,000 bytes, in powers of ten),
// serialises each one as a CARv1 stream and checks that
// EstimateUnixFSFileDefaultChunking predicts the exact stream length.
func TestEstimateUnixFSFileDefaultChunking(t *testing.T) {
	for i := 100; i < 1000000000; i *= 10 {
		b := make([]byte, i)
		rand.Read(b)

		ls := cidlink.DefaultLinkSystem()
		storage := cidlink.Memory{}
		ls.StorageReadOpener = storage.OpenRead
		// nPB counts the dag-pb (intermediate) nodes written while building
		// the file; it is only reported when the estimate is wrong.
		nPB := 0

		ls.StorageWriteOpener = func(lc linking.LinkContext) (io.Writer, linking.BlockWriteCommitter, error) {
			w, bwc, err := storage.OpenWrite(lc)
			if err != nil {
				// Do not hand back a committer that wraps an unusable bwc.
				return nil, nil, err
			}
			return w, func(lnk ipld.Link) error {
				if cl, ok := lnk.(cidlink.Link); ok && cl.Cid.Prefix().Codec == uint64(multicodec.DagPb) {
					nPB++
				}
				return bwc(lnk)
			}, nil
		}
		rt, _, err := builder.BuildUnixFSFile(bytes.NewReader(b), "", &ls)
		if err != nil {
			t.Fatal(err)
		}
		rootLnk, ok := rt.(cidlink.Link)
		if !ok {
			t.Fatalf("root link for file length %d is not a cidlink.Link: %T", i, rt)
		}

		ob := bytes.NewBuffer(nil)
		_, err = car.TraverseV1(context.Background(), &ls, rootLnk.Cid, selectorparse.CommonSelector_ExploreAllRecursively, ob)
		if err != nil {
			t.Fatal(err)
		}
		fileLen := len(ob.Bytes())

		estimate := builder.EstimateUnixFSFileDefaultChunking(uint64(i))
		if estimate != uint64(fileLen) {
			fmt.Printf("%d intermediate nodes.\n", nPB)
			t.Fatalf("estimate for file length %d was %d. should be %d", i, estimate, fileLen)
		}
	}
}

// TestS checks the size reported by carv1.HeaderSize for a version 1 header
// with a single CIDv1 dag-pb/sha2-256 root against the fixed header size (59)
// that EstimateUnixFSFileDefaultChunking adds to its estimate.
func TestS(t *testing.T) {
	const wantHeaderSize = 59

	p := cid.Prefix{
		Version:  1,
		Codec:    uint64(multicodec.DagPb),
		MhType:   multihash.SHA2_256,
		MhLength: 32,
	}
	rt, err := p.Sum([]byte{0})
	if err != nil {
		t.Fatal(err)
	}
	ch := carv1.CarHeader{
		Roots:   []cid.Cid{rt},
		Version: 1,
	}
	s, err := carv1.HeaderSize(&ch)
	if err != nil {
		t.Fatal(err)
	}
	// Fail only on a mismatch; the original failed unconditionally.
	if s != wantHeaderSize {
		t.Fatalf("hs: %d, expected %d", s, wantHeaderSize)
	}
}

func TestUnixFSFileRoundtrip(t *testing.T) {
buf := make([]byte, 10*1024*1024)
u.NewSeededRand(0xdeadbeef).Read(buf)
Expand All @@ -53,7 +119,7 @@ func TestUnixFSFileRoundtrip(t *testing.T) {
ls.StorageReadOpener = storage.OpenRead
ls.StorageWriteOpener = storage.OpenWrite

f, _, err := BuildUnixFSFile(r, "", &ls)
f, _, err := builder.BuildUnixFSFile(r, "", &ls)
if err != nil {
t.Fatal(err)
}
Expand Down
3 changes: 2 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,13 @@ require (
github.com/ipfs/go-ipld-format v0.4.0
github.com/ipfs/go-merkledag v0.10.0
github.com/ipfs/go-unixfs v0.4.4
github.com/ipld/go-car v0.5.0
github.com/ipld/go-car/v2 v2.8.0
github.com/ipld/go-codec-dagpb v1.6.0
github.com/ipld/go-ipld-prime v0.20.0
github.com/multiformats/go-multicodec v0.8.1
github.com/multiformats/go-multihash v0.2.1
github.com/multiformats/go-varint v0.0.7
github.com/spaolacci/murmur3 v1.1.0
github.com/stretchr/testify v1.8.2
google.golang.org/protobuf v1.28.1
Expand Down Expand Up @@ -51,7 +53,6 @@ require (
github.com/multiformats/go-base32 v0.1.0 // indirect
github.com/multiformats/go-base36 v0.2.0 // indirect
github.com/multiformats/go-multibase v0.1.1 // indirect
github.com/multiformats/go-varint v0.0.7 // indirect
github.com/opentracing/opentracing-go v1.2.0 // indirect
github.com/petar/GoLLRB v0.0.0-20210522233825-ae3b015fd3e9 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ github.com/ipfs/go-unixfs v0.4.4 h1:D/dLBOJgny5ZLIur2vIXVQVW0EyDHdOMBDEhgHrt6rY=
github.com/ipfs/go-unixfs v0.4.4/go.mod h1:TSG7G1UuT+l4pNj91raXAPkX0BhJi3jST1FDTfQ5QyM=
github.com/ipfs/go-verifcid v0.0.2 h1:XPnUv0XmdH+ZIhLGKg6U2vaPaRDXb9urMyNVCE7uvTs=
github.com/ipfs/go-verifcid v0.0.2/go.mod h1:40cD9x1y4OWnFXbLNJYRe7MpNvWlMn3LZAG5Wb4xnPU=
github.com/ipld/go-car v0.5.0 h1:kcCEa3CvYMs0iE5BzD5sV7O2EwMiCIp3uF8tA6APQT8=
github.com/ipld/go-car v0.5.0/go.mod h1:ppiN5GWpjOZU9PgpAZ9HbZd9ZgSpwPMr48fGRJOWmvE=
github.com/ipld/go-car/v2 v2.8.0 h1:8tUI+VM1mAQ2Qa7ScK++lfyuZYcGQ70bZ6NpGOcJj5o=
github.com/ipld/go-car/v2 v2.8.0/go.mod h1:a+BnAxUqgr7wcWxW/lI6ctyEQ2v9gjBChPytwFMp2f4=
github.com/ipld/go-codec-dagpb v1.6.0 h1:9nYazfyu9B1p3NAgfVdpRco3Fs2nFC72DqVsMj6rOcc=
Expand Down

0 comments on commit fe18dab

Please sign in to comment.